; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

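; Each test follows the same recipe: the inline asm "nop" clobbers every
; vector register except the operands under test, forcing them to be spilled
; across the asm block, and the CHECK lines verify that the instruction under
; test consumes an operand directly from its stack slot (a "Folded Reload")
; rather than reloading it into a register first.

; valignd/valignq: both sources are spilled and the concatenate-and-shift is
; checked to fold its memory source; the mask and maskz variants additionally
; exercise merge-masking ({%k1}) and zero-masking ({%k1} {z}).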
define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: stack_fold_valignd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[1,2,3,4,5,6,7],ymm0[0]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_valignd_ymm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    # ymm1 {%k1} = mem[1,2,3,4,5,6,7],ymm0[0]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x i32>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
  ret <8 x i32> %5
}

define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: stack_fold_valignd_ymm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7],ymm0[0]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  ret <8 x i32> %4
}

define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: stack_fold_valignq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[1,2,3],ymm0[0]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i64> %2
}

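; pavgb/pavgw: there is no plain IR averaging operation, so the
; zext -> add -> add 1 -> lshr 1 -> trunc sequence below spells out the
; unsigned rounding average that the backend recognizes and selects to a
; single vpavg* with a folded memory operand.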
define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = zext <16 x i8> %a0 to <16 x i16>
  %3 = zext <16 x i8> %a1 to <16 x i16>
  %4 = add <16 x i16> %2, %3
  %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <16 x i16> %6 to <16 x i8>
  ret <16 x i8> %7
}

define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = zext <32 x i8> %a0 to <32 x i16>
  %3 = zext <32 x i8> %a1 to <32 x i16>
  %4 = add <32 x i16> %2, %3
  %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <32 x i16> %6 to <32 x i8>
  ret <32 x i8> %7
}

define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = zext <8 x i16> %a0 to <8 x i32>
  %3 = zext <8 x i16> %a1 to <8 x i32>
  %4 = add <8 x i32> %2, %3
  %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = trunc <8 x i32> %6 to <8 x i16>
  ret <8 x i16> %7
}

define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = zext <16 x i16> %a0 to <16 x i32>
  %3 = zext <16 x i16> %a1 to <16 x i32>
  %4 = add <16 x i32> %2, %3
  %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = trunc <16 x i32> %6 to <16 x i16>
  ret <16 x i16> %7
}

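; vpconflict*: unary AVX512CD conflict-detection intrinsics; only the source
; needs to be spilled, and the reload folds into the instruction's single
; memory operand.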
define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_vpconflictd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %a0)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_vpconflictd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %a0)
  ret <8 x i32> %2
}

define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_vpconflictq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %a0)
  ret <2 x i64> %2
}

define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) {
; CHECK-LABEL: stack_fold_vpconflictq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %a0)
  ret <4 x i64> %2
}

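; extracti32x4/extracti64x2 fold in the store direction instead: the extract
; of the upper 128 bits is folded into the spill itself ("Folded Spill"), and
; a plain vmovaps reloads the result afterwards.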
define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_extracti32x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %1 = zext <8 x i16> %a0 to <8 x i32>
  %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x i32> %2
}

define <2 x i64> @stack_fold_extracti64x2(<4 x i32> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_extracti64x2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %1 = zext <4 x i32> %a0 to <4 x i64>
  %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <2 x i64> %2
}

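; inserti32x4/inserti64x2: the 128-bit source is spilled and vinserti128
; folds it straight from the stack; the trailing add keeps the result in the
; integer execution domain.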
define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_inserti32x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_inserti64x2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

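; pabs*: the icmp sgt zero / negate / select idiom below is the canonical
; absolute-value pattern, matched to a unary vpabs* with a folded reload.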
define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <16 x i8> %a0, zeroinitializer
  %3 = sub <16 x i8> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
  ret <16 x i8> %4
}

define <32 x i8> @stack_fold_pabsb_ymm(<32 x i8> %a0) {
; CHECK-LABEL: stack_fold_pabsb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, zeroinitializer
  %3 = sub <32 x i8> zeroinitializer, %a0
  %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3
  ret <32 x i8> %4
}

define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <4 x i32> %a0, zeroinitializer
  %3 = sub <4 x i32> zeroinitializer, %a0
  %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
  ret <4 x i32> %4
}

define <8 x i32> @stack_fold_pabsd_ymm(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_pabsd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, zeroinitializer
  %3 = sub <8 x i32> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3
  ret <8 x i32> %4
}

define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_pabsq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <2 x i64> %a0, zeroinitializer
  %3 = sub <2 x i64> zeroinitializer, %a0
  %4 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %3
  ret <2 x i64> %4
}

define <4 x i64> @stack_fold_pabsq_ymm(<4 x i64> %a0) {
; CHECK-LABEL: stack_fold_pabsq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <4 x i64> %a0, zeroinitializer
  %3 = sub <4 x i64> zeroinitializer, %a0
  %4 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %3
  ret <4 x i64> %4
}

define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i16> %a0, zeroinitializer
  %3 = sub <8 x i16> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
  ret <8 x i16> %4
}

define <16 x i16> @stack_fold_pabsw_ymm(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_pabsw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, zeroinitializer
  %3 = sub <16 x i16> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3
  ret <16 x i16> %4
}

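; pack*: signed and unsigned saturating pack intrinsics (SSE2, SSE4.1 and
; AVX2 flavours); each test folds the second source operand from its 16- or
; 32-byte stack slot.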
define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_packssdw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_packsswb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_packusdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_packusdw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_packuswb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

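; padd*: plain, signed-saturating (padds*) and unsigned-saturating (paddus*)
; additions. The mask variants bitcast the scalar mask to <N x i1> and select
; against a loaded passthru (merge-masking) or zero (zero-masking).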
define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i8> %a0, %a1
  ret <16 x i8> %2
}

define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, ptr %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_paddb_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i8> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i8>, ptr %a2
  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
  ret <16 x i8> %5
}

define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_paddb_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i8> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
  ret <16 x i8> %4
}

define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <32 x i8> %a0, %a1
  ret <32 x i8> %2
}

define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, ptr %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_paddb_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <32 x i8> %a0, %a1
  %3 = bitcast i32 %mask to <32 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <32 x i8>, ptr %a2
  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
  ret <32 x i8> %5
}

define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_paddb_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <32 x i8> %a0, %a1
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
  ret <32 x i8> %4
}

define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <4 x i32> %a0, %a1
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_paddd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i32> %a0, %a1
  ret <8 x i32> %2
}

define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <2 x i64> %a0, %a1
  ret <2 x i64> %2
}

define <4 x i64> @stack_fold_paddq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <4 x i64> %a0, %a1
  ret <4 x i64> %2
}

define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <32 x i8> @stack_fold_paddsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @stack_fold_paddsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddsw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <32 x i8> @stack_fold_paddusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddusb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @stack_fold_paddusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddusw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i16> %a0, %a1
  ret <8 x i16> %2
}

define <16 x i16> @stack_fold_paddw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i16> %a0, %a1
  ret <16 x i16> %2
}

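; palignr: shuffle indices 32 and 48 select from the second shuffle operand,
; reproducing the per-128-bit-lane byte alignment vpalignr performs, so the
; whole shuffle folds to one vpalignr with a memory source.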
define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_palignr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  ret <32 x i8> %2
}

define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, ptr %passthru, i32 %mask) {
; CHECK-LABEL: stack_fold_palignr_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    # ymm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = load <32 x i8>, ptr %passthru
  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
  ret <32 x i8> %5
}

define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_palignr_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
  ret <32 x i8> %4
}

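; pcmpeq*: under AVX512VL the compares write a mask register directly, so the
; folded vpcmpeq* produces %k0 and kmovd copies it to a GPR; the narrower
; <4 x i1>/<2 x i1> results are widened to eight elements before the bitcast
; to i8.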
911define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
912; CHECK-LABEL: stack_fold_pcmpeqb:
913; CHECK:       # %bb.0:
914; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
915; CHECK-NEXT:    #APP
916; CHECK-NEXT:    nop
917; CHECK-NEXT:    #NO_APP
918; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
919; CHECK-NEXT:    kmovd %k0, %eax
920; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
921; CHECK-NEXT:    retq
922  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
923  %2 = icmp eq <16 x i8> %a0, %a1
924  %3 = bitcast <16 x i1> %2 to i16
925  ret i16 %3
926}
927
928define i8 @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
929; CHECK-LABEL: stack_fold_pcmpeqd:
930; CHECK:       # %bb.0:
931; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
932; CHECK-NEXT:    #APP
933; CHECK-NEXT:    nop
934; CHECK-NEXT:    #NO_APP
935; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
936; CHECK-NEXT:    kmovd %k0, %eax
937; CHECK-NEXT:    # kill: def $al killed $al killed $eax
938; CHECK-NEXT:    retq
939  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
940  %2 = icmp eq <4 x i32> %a0, %a1
941  %3 = shufflevector <4 x i1> %2, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
942  %4 = bitcast <8 x i1> %3 to i8
943  ret i8 %4
944}
945
946define i8 @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
947; CHECK-LABEL: stack_fold_pcmpeqq:
948; CHECK:       # %bb.0:
949; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
950; CHECK-NEXT:    #APP
951; CHECK-NEXT:    nop
952; CHECK-NEXT:    #NO_APP
953; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
954; CHECK-NEXT:    kmovd %k0, %eax
955; CHECK-NEXT:    # kill: def $al killed $al killed $eax
956; CHECK-NEXT:    retq
957  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
958  %2 = icmp eq <2 x i64> %a0, %a1
959  %3 = shufflevector <2 x i1> %2, <2 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
960  %4 = bitcast <8 x i1> %3 to i8
961  ret i8 %4
962}
963
964define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
965; CHECK-LABEL: stack_fold_pcmpeqw:
966; CHECK:       # %bb.0:
967; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
968; CHECK-NEXT:    #APP
969; CHECK-NEXT:    nop
970; CHECK-NEXT:    #NO_APP
971; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
972; CHECK-NEXT:    kmovd %k0, %eax
973; CHECK-NEXT:    # kill: def $al killed $al killed $eax
974; CHECK-NEXT:    retq
975  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
976  %2 = icmp eq <8 x i16> %a0, %a1
977  %3 = bitcast <8 x i1> %2 to i8
978  ret i8 %3
979}
980
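; Variable permutes fold the spilled data vector while the indices live in
; a register; the trailing add forces the integer execution domain so the
; result is consumed by an integer op.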
981define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
982; CHECK-LABEL: stack_fold_permbvar:
983; CHECK:       # %bb.0:
984; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
985; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
986; CHECK-NEXT:    #APP
987; CHECK-NEXT:    nop
988; CHECK-NEXT:    #NO_APP
989; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
990; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
991; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
992; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
993; CHECK-NEXT:    retq
994  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
995  %2 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0)
996  ; add forces execution domain
997  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
998  ret <32 x i8> %3
999}
1000declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) nounwind readonly
1001
1002define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
1003; CHECK-LABEL: stack_fold_permd:
1004; CHECK:       # %bb.0:
1005; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1006; CHECK-NEXT:    #APP
1007; CHECK-NEXT:    nop
1008; CHECK-NEXT:    #NO_APP
1009; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1010; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1011; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
1012; CHECK-NEXT:    retq
1013  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1014  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
1015  ; add forces execution domain
1016  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1017  ret <8 x i32> %3
1018}
1019declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
1020
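; vpermi2* overwrites its index operand, which these tests keep in the
; return register: the intrinsic is called with the indices as the second
; argument, and the second data source is the one spilled and folded.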
1021define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
1022; CHECK-LABEL: stack_fold_vpermi2b:
1023; CHECK:       # %bb.0:
1024; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1025; CHECK-NEXT:    #APP
1026; CHECK-NEXT:    nop
1027; CHECK-NEXT:    #NO_APP
1028; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1029; CHECK-NEXT:    retq
1030  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1031  %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2)
1032  ret <16 x i8> %2
1033}
1034
1035define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
1036; CHECK-LABEL: stack_fold_vpermi2b_ymm:
1037; CHECK:       # %bb.0:
1038; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1039; CHECK-NEXT:    #APP
1040; CHECK-NEXT:    nop
1041; CHECK-NEXT:    #NO_APP
1042; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1043; CHECK-NEXT:    retq
1044  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1045  %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2)
1046  ret <32 x i8> %2
1047}
1048
1049define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
1050; CHECK-LABEL: stack_fold_vpermi2d:
1051; CHECK:       # %bb.0:
1052; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1053; CHECK-NEXT:    #APP
1054; CHECK-NEXT:    nop
1055; CHECK-NEXT:    #NO_APP
1056; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1057; CHECK-NEXT:    retq
1058  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1059  %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
1060  ret <4 x i32> %2
1061}
1062
1063define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
1064; CHECK-LABEL: stack_fold_vpermi2d_ymm:
1065; CHECK:       # %bb.0:
1066; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1067; CHECK-NEXT:    #APP
1068; CHECK-NEXT:    nop
1069; CHECK-NEXT:    #NO_APP
1070; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1071; CHECK-NEXT:    retq
1072  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1073  %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
1074  ret <8 x i32> %2
1075}
1076
1077define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
1078; CHECK-LABEL: stack_fold_vpermi2q:
1079; CHECK:       # %bb.0:
1080; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1081; CHECK-NEXT:    #APP
1082; CHECK-NEXT:    nop
1083; CHECK-NEXT:    #NO_APP
1084; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1085; CHECK-NEXT:    retq
1086  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1087  %2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
1088  ret <2 x i64> %2
1089}
1090
1091define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
1092; CHECK-LABEL: stack_fold_vpermi2q_ymm:
1093; CHECK:       # %bb.0:
1094; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1095; CHECK-NEXT:    #APP
1096; CHECK-NEXT:    nop
1097; CHECK-NEXT:    #NO_APP
1098; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1099; CHECK-NEXT:    retq
1100  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1101  %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
1102  ret <4 x i64> %2
1103}
1104
1105define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
1106; CHECK-LABEL: stack_fold_vpermi2w:
1107; CHECK:       # %bb.0:
1108; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1109; CHECK-NEXT:    #APP
1110; CHECK-NEXT:    nop
1111; CHECK-NEXT:    #NO_APP
1112; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1113; CHECK-NEXT:    retq
1114  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1115  %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
1116  ret <8 x i16> %2
1117}
1118
1119define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
1120; CHECK-LABEL: stack_fold_vpermi2w_ymm:
1121; CHECK:       # %bb.0:
1122; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1123; CHECK-NEXT:    #APP
1124; CHECK-NEXT:    nop
1125; CHECK-NEXT:    #NO_APP
1126; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1127; CHECK-NEXT:    retq
1128  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1129  %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
1130  ret <16 x i16> %2
1131}
1132
1133define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
1134; CHECK-LABEL: stack_fold_permq:
1135; CHECK:       # %bb.0:
1136; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1137; CHECK-NEXT:    #APP
1138; CHECK-NEXT:    nop
1139; CHECK-NEXT:    #NO_APP
1140; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1141; CHECK-NEXT:    # ymm0 = mem[3,2,2,3]
1142; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1143; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
1144; CHECK-NEXT:    retq
1145  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1146  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
1147  ; add forces execution domain
1148  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
1149  ret <4 x i64> %3
1150}
1151
1152define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
1153; CHECK-LABEL: stack_fold_permqvar:
1154; CHECK:       # %bb.0:
1155; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1156; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1157; CHECK-NEXT:    #APP
1158; CHECK-NEXT:    nop
1159; CHECK-NEXT:    #NO_APP
1160; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1161; CHECK-NEXT:    vpermq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1162; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1163; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
1164; CHECK-NEXT:    retq
1165  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1166  %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0)
1167  ; add forces execution domain
1168  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
1169  ret <4 x i64> %3
1170}
1171declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) nounwind readonly
1172
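; The vpermt2* tests use the same vpermi2var intrinsics with the data and
; index arguments swapped; the result then overwrites a data source rather
; than the indices, so instruction selection emits vpermt2* instead.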
1173define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
1174; CHECK-LABEL: stack_fold_vpermt2b:
1175; CHECK:       # %bb.0:
1176; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1177; CHECK-NEXT:    #APP
1178; CHECK-NEXT:    nop
1179; CHECK-NEXT:    #NO_APP
1180; CHECK-NEXT:    vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1181; CHECK-NEXT:    retq
1182  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1183  %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2)
1184  ret <16 x i8> %2
1185}
1186declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>)
1187
1188define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
1189; CHECK-LABEL: stack_fold_vpermt2b_ymm:
1190; CHECK:       # %bb.0:
1191; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1192; CHECK-NEXT:    #APP
1193; CHECK-NEXT:    nop
1194; CHECK-NEXT:    #NO_APP
1195; CHECK-NEXT:    vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1196; CHECK-NEXT:    retq
1197  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1198  %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2)
1199  ret <32 x i8> %2
1200}
1201declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>)
1202
1203define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
1204; CHECK-LABEL: stack_fold_vpermt2d:
1205; CHECK:       # %bb.0:
1206; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1207; CHECK-NEXT:    #APP
1208; CHECK-NEXT:    nop
1209; CHECK-NEXT:    #NO_APP
1210; CHECK-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1211; CHECK-NEXT:    retq
1212  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1213  %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
1214  ret <4 x i32> %2
1215}
1216declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
1217
1218define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
1219; CHECK-LABEL: stack_fold_vpermt2d_ymm:
1220; CHECK:       # %bb.0:
1221; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1222; CHECK-NEXT:    #APP
1223; CHECK-NEXT:    nop
1224; CHECK-NEXT:    #NO_APP
1225; CHECK-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1226; CHECK-NEXT:    retq
1227  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1228  %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
1229  ret <8 x i32> %2
1230}
1231declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
1232
1233define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
1234; CHECK-LABEL: stack_fold_vpermt2q:
1235; CHECK:       # %bb.0:
1236; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1237; CHECK-NEXT:    #APP
1238; CHECK-NEXT:    nop
1239; CHECK-NEXT:    #NO_APP
1240; CHECK-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1241; CHECK-NEXT:    retq
1242  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1243  %2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
1244  ret <2 x i64> %2
1245}
1246declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
1247
1248define <4 x i64> @stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
1249; CHECK-LABEL: stack_fold_vpermt2q_ymm:
1250; CHECK:       # %bb.0:
1251; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1252; CHECK-NEXT:    #APP
1253; CHECK-NEXT:    nop
1254; CHECK-NEXT:    #NO_APP
1255; CHECK-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1256; CHECK-NEXT:    retq
1257  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1258  %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
1259  ret <4 x i64> %2
1260}
1261declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
1262
1263define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
1264; CHECK-LABEL: stack_fold_vpermt2w:
1265; CHECK:       # %bb.0:
1266; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1267; CHECK-NEXT:    #APP
1268; CHECK-NEXT:    nop
1269; CHECK-NEXT:    #NO_APP
1270; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1271; CHECK-NEXT:    retq
1272  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1273  %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2)
1274  ret <8 x i16> %2
1275}
1276declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>)
1277
1278define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
1279; CHECK-LABEL: stack_fold_vpermt2w_ymm:
1280; CHECK:       # %bb.0:
1281; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1282; CHECK-NEXT:    #APP
1283; CHECK-NEXT:    nop
1284; CHECK-NEXT:    #NO_APP
1285; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
1286; CHECK-NEXT:    retq
1287  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1288  %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2)
1289  ret <16 x i16> %2
1290}
1291declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>)
1292
1293define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
1294; CHECK-LABEL: stack_fold_permwvar:
1295; CHECK:       # %bb.0:
1296; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1297; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1298; CHECK-NEXT:    #APP
1299; CHECK-NEXT:    nop
1300; CHECK-NEXT:    #NO_APP
1301; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1302; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1303; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1304; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1305; CHECK-NEXT:    retq
1306  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1307  %2 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0)
1308  ; add forces execution domain
1309  %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1310  ret <16 x i16> %3
1311}
1312declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) nounwind readonly
1313
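; The i1 false argument to llvm.ctlz requests a defined result for zero
; inputs, which matches vplzcnt* (it returns the element width), so the
; intrinsic maps directly onto the folded memory form of the instruction.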
1314define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) {
1315; CHECK-LABEL: stack_fold_vplzcntd:
1316; CHECK:       # %bb.0:
1317; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1318; CHECK-NEXT:    #APP
1319; CHECK-NEXT:    nop
1320; CHECK-NEXT:    #NO_APP
1321; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1322; CHECK-NEXT:    retq
1323  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1324  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0, i1 false)
1325  ret <4 x i32> %2
1326}
1327
1328define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) {
1329; CHECK-LABEL: stack_fold_vplzcntd_ymm:
1330; CHECK:       # %bb.0:
1331; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1332; CHECK-NEXT:    #APP
1333; CHECK-NEXT:    nop
1334; CHECK-NEXT:    #NO_APP
1335; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1336; CHECK-NEXT:    retq
1337  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1338  %2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0, i1 false)
1339  ret <8 x i32> %2
1340}
1341
1342define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) {
1343; CHECK-LABEL: stack_fold_vplzcntq:
1344; CHECK:       # %bb.0:
1345; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1346; CHECK-NEXT:    #APP
1347; CHECK-NEXT:    nop
1348; CHECK-NEXT:    #NO_APP
1349; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1350; CHECK-NEXT:    retq
1351  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1352  %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0, i1 false)
1353  ret <2 x i64> %2
1354}
1355
1356define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) {
1357; CHECK-LABEL: stack_fold_vplzcntq_ymm:
1358; CHECK:       # %bb.0:
1359; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1360; CHECK-NEXT:    #APP
1361; CHECK-NEXT:    nop
1362; CHECK-NEXT:    #NO_APP
1363; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1364; CHECK-NEXT:    retq
1365  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1366  %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0, i1 false)
1367  ret <4 x i64> %2
1368}
1369
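; vpmaddubsw: unmasked, merge-masked, and zero-masked variants. The masked
; tests load %passthru after the asm block (see the per-test comment) so
; the reload still folds into the {%k1} instruction.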
1370define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
1371; CHECK-LABEL: stack_fold_pmaddubsw:
1372; CHECK:       # %bb.0:
1373; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1374; CHECK-NEXT:    #APP
1375; CHECK-NEXT:    nop
1376; CHECK-NEXT:    #NO_APP
1377; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1378; CHECK-NEXT:    retq
1379  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1380  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
1381  ret <8 x i16> %2
1382}
1383declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
1384
1385define <8 x i16> @stack_fold_pmaddubsw_mask(ptr %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
1386; CHECK-LABEL: stack_fold_pmaddubsw_mask:
1387; CHECK:       # %bb.0:
1388; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1389; CHECK-NEXT:    #APP
1390; CHECK-NEXT:    nop
1391; CHECK-NEXT:    #NO_APP
1392; CHECK-NEXT:    kmovd %esi, %k1
1393; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1394; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1395; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
1396; CHECK-NEXT:    retq
1397  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1398  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
1399  %3 = bitcast i8 %mask to <8 x i1>
1400  ; load needed to keep the operation from being scheduled above the asm block
1401  %4 = load <8 x i16>, ptr %passthru
1402  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
1403  ret <8 x i16> %5
1404}
1405
1406define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
1407; CHECK-LABEL: stack_fold_pmaddubsw_maskz:
1408; CHECK:       # %bb.0:
1409; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1410; CHECK-NEXT:    #APP
1411; CHECK-NEXT:    nop
1412; CHECK-NEXT:    #NO_APP
1413; CHECK-NEXT:    kmovd %edi, %k1
1414; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1415; CHECK-NEXT:    retq
1416  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1417  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
1418  %3 = bitcast i8 %mask to <8 x i1>
1419  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
1420  ret <8 x i16> %4
1421}
1422
1423define <16 x i16> @stack_fold_pmaddubsw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1424; CHECK-LABEL: stack_fold_pmaddubsw_ymm:
1425; CHECK:       # %bb.0:
1426; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1427; CHECK-NEXT:    #APP
1428; CHECK-NEXT:    nop
1429; CHECK-NEXT:    #NO_APP
1430; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1431; CHECK-NEXT:    retq
1432  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1433  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
1434  ret <16 x i16> %2
1435}
1436declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1437
1438define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(ptr %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
1439; CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask:
1440; CHECK:       # %bb.0:
1441; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1442; CHECK-NEXT:    #APP
1443; CHECK-NEXT:    nop
1444; CHECK-NEXT:    #NO_APP
1445; CHECK-NEXT:    kmovd %esi, %k1
1446; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1447; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
1448; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
1449; CHECK-NEXT:    retq
1450  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1451  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
1452  %3 = bitcast i16 %mask to <16 x i1>
1453  ; load needed to keep the operation from being scheduled above the asm block
1454  %4 = load <16 x i16>, ptr %passthru
1455  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
1456  ret <16 x i16> %5
1457}
1458
1459define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
1460; CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz:
1461; CHECK:       # %bb.0:
1462; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1463; CHECK-NEXT:    #APP
1464; CHECK-NEXT:    nop
1465; CHECK-NEXT:    #NO_APP
1466; CHECK-NEXT:    kmovd %edi, %k1
1467; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
1468; CHECK-NEXT:    retq
1469  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1470  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
1471  %3 = bitcast i16 %mask to <16 x i1>
1472  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
1473  ret <16 x i16> %4
1474}
1475
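; vpmaddwd halves the lane count, so the 128-bit masked tests shrink the
; bitcast <8 x i1> mask to <4 x i1> with a shufflevector before the select.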
1476define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
1477; CHECK-LABEL: stack_fold_pmaddwd:
1478; CHECK:       # %bb.0:
1479; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1480; CHECK-NEXT:    #APP
1481; CHECK-NEXT:    nop
1482; CHECK-NEXT:    #NO_APP
1483; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1484; CHECK-NEXT:    retq
1485  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1486  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1487  ret <4 x i32> %2
1488}
1489declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
1490
1491define <4 x i32> @stack_fold_pmaddwd_mask(ptr %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
1492; CHECK-LABEL: stack_fold_pmaddwd_mask:
1493; CHECK:       # %bb.0:
1494; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1495; CHECK-NEXT:    #APP
1496; CHECK-NEXT:    nop
1497; CHECK-NEXT:    #NO_APP
1498; CHECK-NEXT:    kmovd %esi, %k1
1499; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1500; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1501; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
1502; CHECK-NEXT:    retq
1503  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1504  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1505  %3 = bitcast i8 %mask to <8 x i1>
1506  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1507  ; load needed to keep the operation from being scheduled above the asm block
1508  %5 = load <4 x i32>, ptr %passthru
1509  %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %5
1510  ret <4 x i32> %6
1511}
1512
1513define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
1514; CHECK-LABEL: stack_fold_pmaddwd_maskz:
1515; CHECK:       # %bb.0:
1516; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1517; CHECK-NEXT:    #APP
1518; CHECK-NEXT:    nop
1519; CHECK-NEXT:    #NO_APP
1520; CHECK-NEXT:    kmovd %edi, %k1
1521; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1522; CHECK-NEXT:    retq
1523  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1524  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1525  %3 = bitcast i8 %mask to <8 x i1>
1526  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1527  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
1528  ret <4 x i32> %5
1529}
1530
1531define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) {
1532; CHECK-LABEL: stack_fold_pmaddwd_ymm:
1533; CHECK:       # %bb.0:
1534; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1535; CHECK-NEXT:    #APP
1536; CHECK-NEXT:    nop
1537; CHECK-NEXT:    #NO_APP
1538; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1539; CHECK-NEXT:    retq
1540  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1541  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
1542  ret <8 x i32> %2
1543}
1544declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1545
1546define <8 x i32> @stack_fold_pmaddwd_ymm_mask(ptr %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
1547; CHECK-LABEL: stack_fold_pmaddwd_ymm_mask:
1548; CHECK:       # %bb.0:
1549; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1550; CHECK-NEXT:    #APP
1551; CHECK-NEXT:    nop
1552; CHECK-NEXT:    #NO_APP
1553; CHECK-NEXT:    kmovd %esi, %k1
1554; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1555; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
1556; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
1557; CHECK-NEXT:    retq
1558  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1559  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
1560  %3 = bitcast i8 %mask to <8 x i1>
1561  ; load needed to keep the operation from being scheduled above the asm block
1562  %4 = load <8 x i32>, ptr %passthru
1563  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
1564  ret <8 x i32> %5
1565}
1566
1567define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
1568; CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz:
1569; CHECK:       # %bb.0:
1570; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1571; CHECK-NEXT:    #APP
1572; CHECK-NEXT:    nop
1573; CHECK-NEXT:    #NO_APP
1574; CHECK-NEXT:    kmovd %edi, %k1
1575; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
1576; CHECK-NEXT:    retq
1577  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1578  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
1579  %3 = bitcast i8 %mask to <8 x i1>
1580  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
1581  ret <8 x i32> %4
1582}
1583
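; Integer min/max tests use the icmp+select idiom, which instruction
; selection matches to vpmax*/vpmin*; the 64-bit element variants (vpmaxsq,
; vpmaxuq, ...) are EVEX-only instructions introduced with AVX-512.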
1584define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
1585; CHECK-LABEL: stack_fold_pmaxsb:
1586; CHECK:       # %bb.0:
1587; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1588; CHECK-NEXT:    #APP
1589; CHECK-NEXT:    nop
1590; CHECK-NEXT:    #NO_APP
1591; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1592; CHECK-NEXT:    retq
1593  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1594  %2 = icmp sgt <16 x i8> %a0, %a1
1595  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1596  ret <16 x i8> %3
1597}
1598
1599define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1600; CHECK-LABEL: stack_fold_pmaxsb_ymm:
1601; CHECK:       # %bb.0:
1602; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1603; CHECK-NEXT:    #APP
1604; CHECK-NEXT:    nop
1605; CHECK-NEXT:    #NO_APP
1606; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1607; CHECK-NEXT:    retq
1608  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1609  %2 = icmp sgt <32 x i8> %a0, %a1
1610  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1611  ret <32 x i8> %3
1612}
1613
1614define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
1615; CHECK-LABEL: stack_fold_pmaxsd:
1616; CHECK:       # %bb.0:
1617; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1618; CHECK-NEXT:    #APP
1619; CHECK-NEXT:    nop
1620; CHECK-NEXT:    #NO_APP
1621; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1622; CHECK-NEXT:    retq
1623  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1624  %2 = icmp sgt <4 x i32> %a0, %a1
1625  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1626  ret <4 x i32> %3
1627}
1628
1629define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1630; CHECK-LABEL: stack_fold_pmaxsd_ymm:
1631; CHECK:       # %bb.0:
1632; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1633; CHECK-NEXT:    #APP
1634; CHECK-NEXT:    nop
1635; CHECK-NEXT:    #NO_APP
1636; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1637; CHECK-NEXT:    retq
1638  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1639  %2 = icmp sgt <8 x i32> %a0, %a1
1640  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1641  ret <8 x i32> %3
1642}
1643
1644define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) {
1645; CHECK-LABEL: stack_fold_pmaxsq:
1646; CHECK:       # %bb.0:
1647; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1648; CHECK-NEXT:    #APP
1649; CHECK-NEXT:    nop
1650; CHECK-NEXT:    #NO_APP
1651; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1652; CHECK-NEXT:    retq
1653  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1654  %2 = icmp sgt <2 x i64> %a0, %a1
1655  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1656  ret <2 x i64> %3
1657}
1658
1659define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1660; CHECK-LABEL: stack_fold_pmaxsq_ymm:
1661; CHECK:       # %bb.0:
1662; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1663; CHECK-NEXT:    #APP
1664; CHECK-NEXT:    nop
1665; CHECK-NEXT:    #NO_APP
1666; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1667; CHECK-NEXT:    retq
1668  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1669  %2 = icmp sgt <4 x i64> %a0, %a1
1670  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1671  ret <4 x i64> %3
1672}
1673
1674define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
1675; CHECK-LABEL: stack_fold_pmaxsw:
1676; CHECK:       # %bb.0:
1677; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1678; CHECK-NEXT:    #APP
1679; CHECK-NEXT:    nop
1680; CHECK-NEXT:    #NO_APP
1681; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1682; CHECK-NEXT:    retq
1683  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1684  %2 = icmp sgt <8 x i16> %a0, %a1
1685  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1686  ret <8 x i16> %3
1687}
1688
1689define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
1690; CHECK-LABEL: stack_fold_pmaxsw_ymm:
1691; CHECK:       # %bb.0:
1692; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1693; CHECK-NEXT:    #APP
1694; CHECK-NEXT:    nop
1695; CHECK-NEXT:    #NO_APP
1696; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1697; CHECK-NEXT:    retq
1698  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1699  %2 = icmp sgt <16 x i16> %a0, %a1
1700  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1701  ret <16 x i16> %3
1702}
1703
1704define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
1705; CHECK-LABEL: stack_fold_pmaxub:
1706; CHECK:       # %bb.0:
1707; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1708; CHECK-NEXT:    #APP
1709; CHECK-NEXT:    nop
1710; CHECK-NEXT:    #NO_APP
1711; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1712; CHECK-NEXT:    retq
1713  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1714  %2 = icmp ugt <16 x i8> %a0, %a1
1715  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1716  ret <16 x i8> %3
1717}
1718
1719define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1720; CHECK-LABEL: stack_fold_pmaxub_ymm:
1721; CHECK:       # %bb.0:
1722; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1723; CHECK-NEXT:    #APP
1724; CHECK-NEXT:    nop
1725; CHECK-NEXT:    #NO_APP
1726; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1727; CHECK-NEXT:    retq
1728  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1729  %2 = icmp ugt <32 x i8> %a0, %a1
1730  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1731  ret <32 x i8> %3
1732}
1733
1734define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
1735; CHECK-LABEL: stack_fold_pmaxud:
1736; CHECK:       # %bb.0:
1737; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1738; CHECK-NEXT:    #APP
1739; CHECK-NEXT:    nop
1740; CHECK-NEXT:    #NO_APP
1741; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1742; CHECK-NEXT:    retq
1743  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1744  %2 = icmp ugt <4 x i32> %a0, %a1
1745  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1746  ret <4 x i32> %3
1747}
1748
1749define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1750; CHECK-LABEL: stack_fold_pmaxud_ymm:
1751; CHECK:       # %bb.0:
1752; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1753; CHECK-NEXT:    #APP
1754; CHECK-NEXT:    nop
1755; CHECK-NEXT:    #NO_APP
1756; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1757; CHECK-NEXT:    retq
1758  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1759  %2 = icmp ugt <8 x i32> %a0, %a1
1760  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1761  ret <8 x i32> %3
1762}
1763
1764define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) {
1765; CHECK-LABEL: stack_fold_pmaxuq:
1766; CHECK:       # %bb.0:
1767; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1768; CHECK-NEXT:    #APP
1769; CHECK-NEXT:    nop
1770; CHECK-NEXT:    #NO_APP
1771; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1772; CHECK-NEXT:    retq
1773  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1774  %2 = icmp ugt <2 x i64> %a0, %a1
1775  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1776  ret <2 x i64> %3
1777}
1778
1779define <2 x i64> @stack_fold_pmaxuq_mask(ptr %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1780; CHECK-LABEL: stack_fold_pmaxuq_mask:
1781; CHECK:       # %bb.0:
1782; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1783; CHECK-NEXT:    #APP
1784; CHECK-NEXT:    nop
1785; CHECK-NEXT:    #NO_APP
1786; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1787; CHECK-NEXT:    kmovd %esi, %k1
1788; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1789; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
1790; CHECK-NEXT:    retq
1791  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1792  %2 = load <2 x i64>, ptr %passthru
1793  %3 = icmp ugt <2 x i64> %a0, %a1
1794  %4 = select <2 x i1> %3, <2 x i64> %a0, <2 x i64> %a1
1795  %5 = bitcast i8 %mask to <8 x i1>
1796  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
1797  %6 = select <2 x i1> %extract, <2 x i64> %4, <2 x i64> %2
1798  ret <2 x i64> %6
1799}
1800
1801define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1802; CHECK-LABEL: stack_fold_pmaxuq_maskz:
1803; CHECK:       # %bb.0:
1804; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1805; CHECK-NEXT:    #APP
1806; CHECK-NEXT:    nop
1807; CHECK-NEXT:    #NO_APP
1808; CHECK-NEXT:    kmovd %edi, %k1
1809; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1810; CHECK-NEXT:    retq
1811  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1812  %2 = icmp ugt <2 x i64> %a0, %a1
1813  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1814  %4 = bitcast i8 %mask to <8 x i1>
1815  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
1816  %5 = select <2 x i1> %extract, <2 x i64> %3, <2 x i64> zeroinitializer
1817  ret <2 x i64> %5
1818}
1819
1820define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1821; CHECK-LABEL: stack_fold_pmaxuq_ymm:
1822; CHECK:       # %bb.0:
1823; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1824; CHECK-NEXT:    #APP
1825; CHECK-NEXT:    nop
1826; CHECK-NEXT:    #NO_APP
1827; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1828; CHECK-NEXT:    retq
1829  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1830  %2 = icmp ugt <4 x i64> %a0, %a1
1831  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1832  ret <4 x i64> %3
1833}
1834
1835define <4 x i64> @stack_fold_pmaxuq_ymm_mask(ptr %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
1836; CHECK-LABEL: stack_fold_pmaxuq_ymm_mask:
1837; CHECK:       # %bb.0:
1838; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1839; CHECK-NEXT:    #APP
1840; CHECK-NEXT:    nop
1841; CHECK-NEXT:    #NO_APP
1842; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1843; CHECK-NEXT:    kmovd %esi, %k1
1844; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
1845; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
1846; CHECK-NEXT:    retq
1847  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1848  %2 = load <4 x i64>, ptr %passthru
1849  %3 = icmp ugt <4 x i64> %a0, %a1
1850  %4 = select <4 x i1> %3, <4 x i64> %a0, <4 x i64> %a1
1851  %5 = bitcast i8 %mask to <8 x i1>
1852  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1853  %6 = select <4 x i1> %extract, <4 x i64> %4, <4 x i64> %2
1854  ret <4 x i64> %6
1855}
1856
1857define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
1858; CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz:
1859; CHECK:       # %bb.0:
1860; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1861; CHECK-NEXT:    #APP
1862; CHECK-NEXT:    nop
1863; CHECK-NEXT:    #NO_APP
1864; CHECK-NEXT:    kmovd %edi, %k1
1865; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
1866; CHECK-NEXT:    retq
1867  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1868  %2 = icmp ugt <4 x i64> %a0, %a1
1869  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1870  %4 = bitcast i8 %mask to <8 x i1>
1871  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1872  %5 = select <4 x i1> %extract, <4 x i64> %3, <4 x i64> zeroinitializer
1873  ret <4 x i64> %5
1874}
1875
1876define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
1877; CHECK-LABEL: stack_fold_pmaxuw:
1878; CHECK:       # %bb.0:
1879; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1880; CHECK-NEXT:    #APP
1881; CHECK-NEXT:    nop
1882; CHECK-NEXT:    #NO_APP
1883; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1884; CHECK-NEXT:    retq
1885  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1886  %2 = icmp ugt <8 x i16> %a0, %a1
1887  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1888  ret <8 x i16> %3
1889}
1890
1891define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
1892; CHECK-LABEL: stack_fold_pmaxuw_ymm:
1893; CHECK:       # %bb.0:
1894; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1895; CHECK-NEXT:    #APP
1896; CHECK-NEXT:    nop
1897; CHECK-NEXT:    #NO_APP
1898; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1899; CHECK-NEXT:    retq
1900  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1901  %2 = icmp ugt <16 x i16> %a0, %a1
1902  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1903  ret <16 x i16> %3
1904}
1905
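; The vpmin* tests mirror the vpmax* tests above: minimum is an icmp (slt for
; signed, ult for unsigned) feeding a select of the two operands, e.g. for
; vpminsb (illustrative names):
;   %c = icmp slt <16 x i8> %a0, %a1
;   %r = select <16 x i1> %c, <16 x i8> %a0, <16 x i8> %a1
; which the backend matches to VPMINS*/VPMINU* with the second operand folded
; from the stack slot.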
1906define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
1907; CHECK-LABEL: stack_fold_pminsb:
1908; CHECK:       # %bb.0:
1909; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1910; CHECK-NEXT:    #APP
1911; CHECK-NEXT:    nop
1912; CHECK-NEXT:    #NO_APP
1913; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1914; CHECK-NEXT:    retq
1915  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1916  %2 = icmp slt <16 x i8> %a0, %a1
1917  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1918  ret <16 x i8> %3
1919}
1920
1921define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
1922; CHECK-LABEL: stack_fold_pminsb_ymm:
1923; CHECK:       # %bb.0:
1924; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1925; CHECK-NEXT:    #APP
1926; CHECK-NEXT:    nop
1927; CHECK-NEXT:    #NO_APP
1928; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1929; CHECK-NEXT:    retq
1930  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1931  %2 = icmp slt <32 x i8> %a0, %a1
1932  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1933  ret <32 x i8> %3
1934}
1935
1936define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
1937; CHECK-LABEL: stack_fold_pminsd:
1938; CHECK:       # %bb.0:
1939; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1940; CHECK-NEXT:    #APP
1941; CHECK-NEXT:    nop
1942; CHECK-NEXT:    #NO_APP
1943; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1944; CHECK-NEXT:    retq
1945  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1946  %2 = icmp slt <4 x i32> %a0, %a1
1947  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1948  ret <4 x i32> %3
1949}
1950
1951define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1952; CHECK-LABEL: stack_fold_pminsd_ymm:
1953; CHECK:       # %bb.0:
1954; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1955; CHECK-NEXT:    #APP
1956; CHECK-NEXT:    nop
1957; CHECK-NEXT:    #NO_APP
1958; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1959; CHECK-NEXT:    retq
1960  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1961  %2 = icmp slt <8 x i32> %a0, %a1
1962  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1963  ret <8 x i32> %3
1964}
1965
1966define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) {
1967; CHECK-LABEL: stack_fold_pminsq:
1968; CHECK:       # %bb.0:
1969; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1970; CHECK-NEXT:    #APP
1971; CHECK-NEXT:    nop
1972; CHECK-NEXT:    #NO_APP
1973; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1974; CHECK-NEXT:    retq
1975  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1976  %2 = icmp slt <2 x i64> %a0, %a1
1977  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
1978  ret <2 x i64> %3
1979}
1980
1981define <4 x i64> @stack_fold_pminsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1982; CHECK-LABEL: stack_fold_pminsq_ymm:
1983; CHECK:       # %bb.0:
1984; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1985; CHECK-NEXT:    #APP
1986; CHECK-NEXT:    nop
1987; CHECK-NEXT:    #NO_APP
1988; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1989; CHECK-NEXT:    retq
1990  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1991  %2 = icmp slt <4 x i64> %a0, %a1
1992  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
1993  ret <4 x i64> %3
1994}
1995
1996define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
1997; CHECK-LABEL: stack_fold_pminsw:
1998; CHECK:       # %bb.0:
1999; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2000; CHECK-NEXT:    #APP
2001; CHECK-NEXT:    nop
2002; CHECK-NEXT:    #NO_APP
2003; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2004; CHECK-NEXT:    retq
2005  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2006  %2 = icmp slt <8 x i16> %a0, %a1
2007  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
2008  ret <8 x i16> %3
2009}
2010
2011define <16 x i16> @stack_fold_pminsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
2012; CHECK-LABEL: stack_fold_pminsw_ymm:
2013; CHECK:       # %bb.0:
2014; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2015; CHECK-NEXT:    #APP
2016; CHECK-NEXT:    nop
2017; CHECK-NEXT:    #NO_APP
2018; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2019; CHECK-NEXT:    retq
2020  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2021  %2 = icmp slt <16 x i16> %a0, %a1
2022  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
2023  ret <16 x i16> %3
2024}
2025
2026define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
2027; CHECK-LABEL: stack_fold_pminub:
2028; CHECK:       # %bb.0:
2029; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2030; CHECK-NEXT:    #APP
2031; CHECK-NEXT:    nop
2032; CHECK-NEXT:    #NO_APP
2033; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2034; CHECK-NEXT:    retq
2035  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2036  %2 = icmp ult <16 x i8> %a0, %a1
2037  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
2038  ret <16 x i8> %3
2039}
2040
2041define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
2042; CHECK-LABEL: stack_fold_pminub_ymm:
2043; CHECK:       # %bb.0:
2044; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2045; CHECK-NEXT:    #APP
2046; CHECK-NEXT:    nop
2047; CHECK-NEXT:    #NO_APP
2048; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2049; CHECK-NEXT:    retq
2050  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2051  %2 = icmp ult <32 x i8> %a0, %a1
2052  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
2053  ret <32 x i8> %3
2054}
2055
2056define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
2057; CHECK-LABEL: stack_fold_pminud:
2058; CHECK:       # %bb.0:
2059; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2060; CHECK-NEXT:    #APP
2061; CHECK-NEXT:    nop
2062; CHECK-NEXT:    #NO_APP
2063; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2064; CHECK-NEXT:    retq
2065  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2066  %2 = icmp ult <4 x i32> %a0, %a1
2067  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
2068  ret <4 x i32> %3
2069}
2070
2071define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
2072; CHECK-LABEL: stack_fold_pminud_ymm:
2073; CHECK:       # %bb.0:
2074; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2075; CHECK-NEXT:    #APP
2076; CHECK-NEXT:    nop
2077; CHECK-NEXT:    #NO_APP
2078; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2079; CHECK-NEXT:    retq
2080  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2081  %2 = icmp ult <8 x i32> %a0, %a1
2082  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
2083  ret <8 x i32> %3
2084}
2085
2086define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) {
2087; CHECK-LABEL: stack_fold_pminuq:
2088; CHECK:       # %bb.0:
2089; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2090; CHECK-NEXT:    #APP
2091; CHECK-NEXT:    nop
2092; CHECK-NEXT:    #NO_APP
2093; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2094; CHECK-NEXT:    retq
2095  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2096  %2 = icmp ult <2 x i64> %a0, %a1
2097  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
2098  ret <2 x i64> %3
2099}
2100
2101define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
2102; CHECK-LABEL: stack_fold_pminuq_ymm:
2103; CHECK:       # %bb.0:
2104; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2105; CHECK-NEXT:    #APP
2106; CHECK-NEXT:    nop
2107; CHECK-NEXT:    #NO_APP
2108; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2109; CHECK-NEXT:    retq
2110  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2111  %2 = icmp ult <4 x i64> %a0, %a1
2112  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
2113  ret <4 x i64> %3
2114}
2115
2116define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
2117; CHECK-LABEL: stack_fold_pminuw:
2118; CHECK:       # %bb.0:
2119; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2120; CHECK-NEXT:    #APP
2121; CHECK-NEXT:    nop
2122; CHECK-NEXT:    #NO_APP
2123; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2124; CHECK-NEXT:    retq
2125  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2126  %2 = icmp ult <8 x i16> %a0, %a1
2127  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
2128  ret <8 x i16> %3
2129}
2130
2131define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
2132; CHECK-LABEL: stack_fold_pminuw_ymm:
2133; CHECK:       # %bb.0:
2134; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2135; CHECK-NEXT:    #APP
2136; CHECK-NEXT:    nop
2137; CHECK-NEXT:    #NO_APP
2138; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2139; CHECK-NEXT:    retq
2140  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2141  %2 = icmp ult <16 x i16> %a0, %a1
2142  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
2143  ret <16 x i16> %3
2144}
2145
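; For the truncating vpmov* tests the folding direction is reversed: the
; truncation itself is folded into the store (note the "Folded Spill"
; annotation), so the stack slot holds the narrowed value and the reload is a
; plain vmovaps. Plain truncation is written either as a trunc instruction or
; via the @llvm.x86.avx512.mask.pmov.* intrinsics with an all-ones mask.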
2146define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) {
2147; CHECK-LABEL: stack_fold_vpmovdw:
2148; CHECK:       # %bb.0:
2149; CHECK-NEXT:    vpmovdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2150; CHECK-NEXT:    #APP
2151; CHECK-NEXT:    nop
2152; CHECK-NEXT:    #NO_APP
2153; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2154; CHECK-NEXT:    vzeroupper
2155; CHECK-NEXT:    retq
2156  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
2157  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2158  ret <8 x i16> %1
2159}
2160declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
2161
2162define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) {
2163; CHECK-LABEL: stack_fold_vpmovqd:
2164; CHECK:       # %bb.0:
2165; CHECK-NEXT:    vpmovqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2166; CHECK-NEXT:    #APP
2167; CHECK-NEXT:    nop
2168; CHECK-NEXT:    #NO_APP
2169; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2170; CHECK-NEXT:    vzeroupper
2171; CHECK-NEXT:    retq
2172  %1 = trunc <4 x i64> %a0 to <4 x i32>
2173  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2174  ret <4 x i32> %1
2175}
2176declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
2177
2178define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) {
2179; CHECK-LABEL: stack_fold_vpmovwb:
2180; CHECK:       # %bb.0:
2181; CHECK-NEXT:    vpmovwb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2182; CHECK-NEXT:    #APP
2183; CHECK-NEXT:    nop
2184; CHECK-NEXT:    #NO_APP
2185; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2186; CHECK-NEXT:    vzeroupper
2187; CHECK-NEXT:    retq
2188  %1 = trunc <16 x i16> %a0 to <16 x i8>
2189  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2190  ret <16 x i8> %1
2191}
2192declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
2193
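; vpmovs* perform signed-saturating truncation, which has no single plain-IR
; instruction, so these tests call the @llvm.x86.avx512.mask.pmovs.*
; intrinsics directly; the folded-spill shape is the same as above.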
2194define <8 x i16> @stack_fold_vpmovsdw(<8 x i32> %a0) {
2195; CHECK-LABEL: stack_fold_vpmovsdw:
2196; CHECK:       # %bb.0:
2197; CHECK-NEXT:    vpmovsdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2198; CHECK-NEXT:    #APP
2199; CHECK-NEXT:    nop
2200; CHECK-NEXT:    #NO_APP
2201; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2202; CHECK-NEXT:    vzeroupper
2203; CHECK-NEXT:    retq
2204  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
2205  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2206  ret <8 x i16> %1
2207}
2208declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
2209
2210define <4 x i32> @stack_fold_vpmovsqd(<4 x i64> %a0) {
2211; CHECK-LABEL: stack_fold_vpmovsqd:
2212; CHECK:       # %bb.0:
2213; CHECK-NEXT:    vpmovsqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2214; CHECK-NEXT:    #APP
2215; CHECK-NEXT:    nop
2216; CHECK-NEXT:    #NO_APP
2217; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2218; CHECK-NEXT:    vzeroupper
2219; CHECK-NEXT:    retq
2220  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
2221  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2222  ret <4 x i32> %1
2223}
2224declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
2225
2226define <16 x i8> @stack_fold_vpmovswb(<16 x i16> %a0) {
2227; CHECK-LABEL: stack_fold_vpmovswb:
2228; CHECK:       # %bb.0:
2229; CHECK-NEXT:    vpmovswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2230; CHECK-NEXT:    #APP
2231; CHECK-NEXT:    nop
2232; CHECK-NEXT:    #NO_APP
2233; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2234; CHECK-NEXT:    vzeroupper
2235; CHECK-NEXT:    retq
2236  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
2237  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2238  ret <16 x i8> %1
2239}
2240declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
2241
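; Sign-extension folds: the source subvector is taken by a shufflevector of
; the low lanes (or used whole) and then sext'd; the vpmovsx* instructions
; below fold the 16-byte stack slot as their memory source.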
2242define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
2243; CHECK-LABEL: stack_fold_pmovsxbd:
2244; CHECK:       # %bb.0:
2245; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2246; CHECK-NEXT:    #APP
2247; CHECK-NEXT:    nop
2248; CHECK-NEXT:    #NO_APP
2249; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2250; CHECK-NEXT:    retq
2251  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2252  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2253  %3 = sext <4 x i8> %2 to <4 x i32>
2254  ret <4 x i32> %3
2255}
2256
2257define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) {
2258; CHECK-LABEL: stack_fold_pmovsxbd_ymm:
2259; CHECK:       # %bb.0:
2260; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2261; CHECK-NEXT:    #APP
2262; CHECK-NEXT:    nop
2263; CHECK-NEXT:    #NO_APP
2264; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2265; CHECK-NEXT:    retq
2266  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2267  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2268  %3 = sext <8 x i8> %2 to <8 x i32>
2269  ret <8 x i32> %3
2270}
2271
2272define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
2273; CHECK-LABEL: stack_fold_pmovsxbq:
2274; CHECK:       # %bb.0:
2275; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2276; CHECK-NEXT:    #APP
2277; CHECK-NEXT:    nop
2278; CHECK-NEXT:    #NO_APP
2279; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2280; CHECK-NEXT:    retq
2281  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2282  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
2283  %3 = sext <2 x i8> %2 to <2 x i64>
2284  ret <2 x i64> %3
2285}
2286
2287define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) {
2288; CHECK-LABEL: stack_fold_pmovsxbq_ymm:
2289; CHECK:       # %bb.0:
2290; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2291; CHECK-NEXT:    #APP
2292; CHECK-NEXT:    nop
2293; CHECK-NEXT:    #NO_APP
2294; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2295; CHECK-NEXT:    retq
2296  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2297  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2298  %3 = sext <4 x i8> %2 to <4 x i64>
2299  ret <4 x i64> %3
2300}
2301
2302define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
2303; CHECK-LABEL: stack_fold_pmovsxbw:
2304; CHECK:       # %bb.0:
2305; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2306; CHECK-NEXT:    #APP
2307; CHECK-NEXT:    nop
2308; CHECK-NEXT:    #NO_APP
2309; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2310; CHECK-NEXT:    retq
2311  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2312  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2313  %3 = sext <8 x i8> %2 to <8 x i16>
2314  ret <8 x i16> %3
2315}
2316
2317define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) {
2318; CHECK-LABEL: stack_fold_pmovsxbw_ymm:
2319; CHECK:       # %bb.0:
2320; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2321; CHECK-NEXT:    #APP
2322; CHECK-NEXT:    nop
2323; CHECK-NEXT:    #NO_APP
2324; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2325; CHECK-NEXT:    retq
2326  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2327  %2 = sext <16 x i8> %a0 to <16 x i16>
2328  ret <16 x i16> %2
2329}
2330
2331define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
2332; CHECK-LABEL: stack_fold_pmovsxdq:
2333; CHECK:       # %bb.0:
2334; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2335; CHECK-NEXT:    #APP
2336; CHECK-NEXT:    nop
2337; CHECK-NEXT:    #NO_APP
2338; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2339; CHECK-NEXT:    retq
2340  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2341  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2342  %3 = sext <2 x i32> %2 to <2 x i64>
2343  ret <2 x i64> %3
2344}
2345
2346define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) {
2347; CHECK-LABEL: stack_fold_pmovsxdq_ymm:
2348; CHECK:       # %bb.0:
2349; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2350; CHECK-NEXT:    #APP
2351; CHECK-NEXT:    nop
2352; CHECK-NEXT:    #NO_APP
2353; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2354; CHECK-NEXT:    retq
2355  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2356  %2 = sext <4 x i32> %a0 to <4 x i64>
2357  ret <4 x i64> %2
2358}
2359
2360define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
2361; CHECK-LABEL: stack_fold_pmovsxwd:
2362; CHECK:       # %bb.0:
2363; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2364; CHECK-NEXT:    #APP
2365; CHECK-NEXT:    nop
2366; CHECK-NEXT:    #NO_APP
2367; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2368; CHECK-NEXT:    retq
2369  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2370  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2371  %3 = sext <4 x i16> %2 to <4 x i32>
2372  ret <4 x i32> %3
2373}
2374
2375define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) {
2376; CHECK-LABEL: stack_fold_pmovsxwd_ymm:
2377; CHECK:       # %bb.0:
2378; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2379; CHECK-NEXT:    #APP
2380; CHECK-NEXT:    nop
2381; CHECK-NEXT:    #NO_APP
2382; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2383; CHECK-NEXT:    retq
2384  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2385  %2 = sext <8 x i16> %a0 to <8 x i32>
2386  ret <8 x i32> %2
2387}
2388
2389define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
2390; CHECK-LABEL: stack_fold_pmovsxwq:
2391; CHECK:       # %bb.0:
2392; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2393; CHECK-NEXT:    #APP
2394; CHECK-NEXT:    nop
2395; CHECK-NEXT:    #NO_APP
2396; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2397; CHECK-NEXT:    retq
2398  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2399  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
2400  %3 = sext <2 x i16> %2 to <2 x i64>
2401  ret <2 x i64> %3
2402}
2403
2404define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) {
2405; CHECK-LABEL: stack_fold_pmovsxwq_ymm:
2406; CHECK:       # %bb.0:
2407; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2408; CHECK-NEXT:    #APP
2409; CHECK-NEXT:    nop
2410; CHECK-NEXT:    #NO_APP
2411; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2412; CHECK-NEXT:    retq
2413  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2414  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2415  %3 = sext <4 x i16> %2 to <4 x i64>
2416  ret <4 x i64> %3
2417}
2418
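; vpmovus* perform unsigned-saturating truncation, again exercised through the
; @llvm.x86.avx512.mask.pmovus.* intrinsics with an all-ones mask.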
2419define <8 x i16> @stack_fold_vpmovusdw(<8 x i32> %a0) {
2420; CHECK-LABEL: stack_fold_vpmovusdw:
2421; CHECK:       # %bb.0:
2422; CHECK-NEXT:    vpmovusdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2423; CHECK-NEXT:    #APP
2424; CHECK-NEXT:    nop
2425; CHECK-NEXT:    #NO_APP
2426; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2427; CHECK-NEXT:    vzeroupper
2428; CHECK-NEXT:    retq
2429  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
2430  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2431  ret <8 x i16> %1
2432}
2433declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
2434
2435define <4 x i32> @stack_fold_vpmovusqd(<4 x i64> %a0) {
2436; CHECK-LABEL: stack_fold_vpmovusqd:
2437; CHECK:       # %bb.0:
2438; CHECK-NEXT:    vpmovusqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2439; CHECK-NEXT:    #APP
2440; CHECK-NEXT:    nop
2441; CHECK-NEXT:    #NO_APP
2442; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2443; CHECK-NEXT:    vzeroupper
2444; CHECK-NEXT:    retq
2445  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
2446  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2447  ret <4 x i32> %1
2448}
2449declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
2450
2451define <16 x i8> @stack_fold_vpmovuswb(<16 x i16> %a0) {
2452; CHECK-LABEL: stack_fold_vpmovuswb:
2453; CHECK:       # %bb.0:
2454; CHECK-NEXT:    vpmovuswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
2455; CHECK-NEXT:    #APP
2456; CHECK-NEXT:    nop
2457; CHECK-NEXT:    #NO_APP
2458; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2459; CHECK-NEXT:    vzeroupper
2460; CHECK-NEXT:    retq
2461  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
2462  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2463  ret <16 x i8> %1
2464}
2465declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
2466
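; Zero-extension appears in two equivalent IR forms: an explicit zext, or an
; interleave with zeroinitializer via shufflevector followed by a bitcast to
; the wider element type. Both forms select vpmovzx* with the memory operand
; folded, as the lane comments below show.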
2467define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
2468; CHECK-LABEL: stack_fold_pmovzxbd:
2469; CHECK:       # %bb.0:
2470; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2471; CHECK-NEXT:    #APP
2472; CHECK-NEXT:    nop
2473; CHECK-NEXT:    #NO_APP
2474; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2475; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2476; CHECK-NEXT:    retq
2477  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2478  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
2479  %3 = bitcast <16 x i8> %2 to <4 x i32>
2480  ret <4 x i32> %3
2481}
2482
2483define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) {
2484; CHECK-LABEL: stack_fold_pmovzxbd_ymm:
2485; CHECK:       # %bb.0:
2486; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2487; CHECK-NEXT:    #APP
2488; CHECK-NEXT:    nop
2489; CHECK-NEXT:    #NO_APP
2490; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2491; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
2492; CHECK-NEXT:    retq
2493  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2494  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2495  %3 = zext <8 x i8> %2 to <8 x i32>
2496  ret <8 x i32> %3
2497}
2498
2499define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
2500; CHECK-LABEL: stack_fold_pmovzxbq:
2501; CHECK:       # %bb.0:
2502; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2503; CHECK-NEXT:    #APP
2504; CHECK-NEXT:    nop
2505; CHECK-NEXT:    #NO_APP
2506; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2507; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
2508; CHECK-NEXT:    retq
2509  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2510  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
2511  %3 = bitcast <16 x i8> %2 to <2 x i64>
2512  ret <2 x i64> %3
2513}
2514
2515define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) {
2516; CHECK-LABEL: stack_fold_pmovzxbq_ymm:
2517; CHECK:       # %bb.0:
2518; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2519; CHECK-NEXT:    #APP
2520; CHECK-NEXT:    nop
2521; CHECK-NEXT:    #NO_APP
2522; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2523; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
2524; CHECK-NEXT:    retq
2525  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2526  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2527  %3 = zext <4 x i8> %2 to <4 x i64>
2528  ret <4 x i64> %3
2529}
2530
2531define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
2532; CHECK-LABEL: stack_fold_pmovzxbw:
2533; CHECK:       # %bb.0:
2534; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2535; CHECK-NEXT:    #APP
2536; CHECK-NEXT:    nop
2537; CHECK-NEXT:    #NO_APP
2538; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2539; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2540; CHECK-NEXT:    retq
2541  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2542  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
2543  %3 = bitcast <16 x i8> %2 to <8 x i16>
2544  ret <8 x i16> %3
2545}
2546
2547define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) {
2548; CHECK-LABEL: stack_fold_pmovzxbw_ymm:
2549; CHECK:       # %bb.0:
2550; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2551; CHECK-NEXT:    #APP
2552; CHECK-NEXT:    nop
2553; CHECK-NEXT:    #NO_APP
2554; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2555; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2556; CHECK-NEXT:    retq
2557  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2558  %2 = zext <16 x i8> %a0 to <16 x i16>
2559  ret <16 x i16> %2
2560}
2561
2562define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
2563; CHECK-LABEL: stack_fold_pmovzxdq:
2564; CHECK:       # %bb.0:
2565; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2566; CHECK-NEXT:    #APP
2567; CHECK-NEXT:    nop
2568; CHECK-NEXT:    #NO_APP
2569; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2570; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero
2571; CHECK-NEXT:    retq
2572  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2573  %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2574  %3 = bitcast <4 x i32> %2 to <2 x i64>
2575  ret <2 x i64> %3
2576}
2577
2578define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) {
2579; CHECK-LABEL: stack_fold_pmovzxdq_ymm:
2580; CHECK:       # %bb.0:
2581; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2582; CHECK-NEXT:    #APP
2583; CHECK-NEXT:    nop
2584; CHECK-NEXT:    #NO_APP
2585; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2586; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2587; CHECK-NEXT:    retq
2588  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2589  %2 = zext <4 x i32> %a0 to <4 x i64>
2590  ret <4 x i64> %2
2591}
2592
2593define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
2594; CHECK-LABEL: stack_fold_pmovzxwd:
2595; CHECK:       # %bb.0:
2596; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2597; CHECK-NEXT:    #APP
2598; CHECK-NEXT:    nop
2599; CHECK-NEXT:    #NO_APP
2600; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2601; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2602; CHECK-NEXT:    retq
2603  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2604  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
2605  %3 = bitcast <8 x i16> %2 to <4 x i32>
2606  ret <4 x i32> %3
2607}
2608
2609define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) {
2610; CHECK-LABEL: stack_fold_pmovzxwd_ymm:
2611; CHECK:       # %bb.0:
2612; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2613; CHECK-NEXT:    #APP
2614; CHECK-NEXT:    nop
2615; CHECK-NEXT:    #NO_APP
2616; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2617; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2618; CHECK-NEXT:    retq
2619  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2620  %2 = zext <8 x i16> %a0 to <8 x i32>
2621  ret <8 x i32> %2
2622}
2623
2624define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
2625; CHECK-LABEL: stack_fold_pmovzxwq:
2626; CHECK:       # %bb.0:
2627; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2628; CHECK-NEXT:    #APP
2629; CHECK-NEXT:    nop
2630; CHECK-NEXT:    #NO_APP
2631; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2632; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
2633; CHECK-NEXT:    retq
2634  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2635  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
2636  %3 = bitcast <8 x i16> %2 to <2 x i64>
2637  ret <2 x i64> %3
2638}
2639
2640define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) {
2641; CHECK-LABEL: stack_fold_pmovzxwq_ymm:
2642; CHECK:       # %bb.0:
2643; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2644; CHECK-NEXT:    #APP
2645; CHECK-NEXT:    nop
2646; CHECK-NEXT:    #NO_APP
2647; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
2648; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2649; CHECK-NEXT:    retq
2650  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2651  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2652  %3 = zext <4 x i16> %2 to <4 x i64>
2653  ret <4 x i64> %3
2654}
2655
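; The masked variants check that the vpmovzxwq fold also holds under {%k1}
; merge masking and {%k1} {z} zero masking.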
2656define <4 x i64> @stack_fold_pmovzxwq_maskz_ymm(<8 x i16> %a0, i8 %mask) {
2657; CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm:
2658; CHECK:       # %bb.0:
2659; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2660; CHECK-NEXT:    #APP
2661; CHECK-NEXT:    nop
2662; CHECK-NEXT:    #NO_APP
2663; CHECK-NEXT:    kmovd %edi, %k1
2664; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 16-byte Folded Reload
2665; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2666; CHECK-NEXT:    retq
2667  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2668  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2669  %3 = zext <4 x i16> %2 to <4 x i64>
2670  %4 = bitcast i8 %mask to <8 x i1>
2671  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2672  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> zeroinitializer
2673  ret <4 x i64> %6
2674}
2675
2676define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
2677; CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm:
2678; CHECK:       # %bb.0:
2679; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2680; CHECK-NEXT:    #APP
2681; CHECK-NEXT:    nop
2682; CHECK-NEXT:    #NO_APP
2683; CHECK-NEXT:    kmovd %edi, %k1
2684; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 16-byte Folded Reload
2685; CHECK-NEXT:    # ymm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2686; CHECK-NEXT:    retq
2687  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2688  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2689  %3 = zext <4 x i16> %2 to <4 x i64>
2690  %4 = bitcast i8 %mask to <8 x i1>
2691  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2692  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %passthru
2693  ret <4 x i64> %6
2694}
2695
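; PMULDQ/PMULUDQ multiply the low 32 bits of each 64-bit lane into a full
; 64-bit product. In IR the signed form sign-extends in place with a shl/ashr
; by 32 pair and the unsigned form clears the high half with an and, e.g. for
; vpmuludq (illustrative names):
;   %lo0 = and <2 x i64> %x, <i64 4294967295, i64 4294967295>
;   %lo1 = and <2 x i64> %y, <i64 4294967295, i64 4294967295>
;   %r   = mul <2 x i64> %lo0, %lo1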
2696define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
2697; CHECK-LABEL: stack_fold_pmuldq:
2698; CHECK:       # %bb.0:
2699; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2700; CHECK-NEXT:    #APP
2701; CHECK-NEXT:    nop
2702; CHECK-NEXT:    #NO_APP
2703; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2704; CHECK-NEXT:    retq
2705  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2706  %2 = bitcast <4 x i32> %a0 to <2 x i64>
2707  %3 = bitcast <4 x i32> %a1 to <2 x i64>
2708  %4 = shl <2 x i64> %2, <i64 32, i64 32>
2709  %5 = ashr <2 x i64> %4, <i64 32, i64 32>
2710  %6 = shl <2 x i64> %3, <i64 32, i64 32>
2711  %7 = ashr <2 x i64> %6, <i64 32, i64 32>
2712  %8 = mul <2 x i64> %5, %7
2713  ret <2 x i64> %8
2714}
2715
2716define <4 x i64> @stack_fold_pmuldq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
2717; CHECK-LABEL: stack_fold_pmuldq_ymm:
2718; CHECK:       # %bb.0:
2719; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2720; CHECK-NEXT:    #APP
2721; CHECK-NEXT:    nop
2722; CHECK-NEXT:    #NO_APP
2723; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2724; CHECK-NEXT:    retq
2725  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2726  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2727  %3 = bitcast <8 x i32> %a1 to <4 x i64>
2728  %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32>
2729  %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
2730  %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32>
2731  %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
2732  %8 = mul <4 x i64> %5, %7
2733  ret <4 x i64> %8
2734}
2735
2736define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
2737; CHECK-LABEL: stack_fold_pmuludq:
2738; CHECK:       # %bb.0:
2739; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2740; CHECK-NEXT:    #APP
2741; CHECK-NEXT:    nop
2742; CHECK-NEXT:    #NO_APP
2743; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2744; CHECK-NEXT:    retq
2745  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2746  %2 = bitcast <4 x i32> %a0 to <2 x i64>
2747  %3 = bitcast <4 x i32> %a1 to <2 x i64>
2748  %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295>
2749  %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295>
2750  %6 = mul <2 x i64> %4, %5
2751  ret <2 x i64> %6
2752}
2753
2754define <4 x i64> @stack_fold_pmuludq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
2755; CHECK-LABEL: stack_fold_pmuludq_ymm:
2756; CHECK:       # %bb.0:
2757; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2758; CHECK-NEXT:    #APP
2759; CHECK-NEXT:    nop
2760; CHECK-NEXT:    #NO_APP
2761; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2762; CHECK-NEXT:    retq
2763  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2764  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2765  %3 = bitcast <8 x i32> %a1 to <4 x i64>
2766  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2767  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2768  %6 = mul <4 x i64> %4, %5
2769  ret <4 x i64> %6
2770}
2771
2772define <4 x i64> @stack_fold_pmuludq_ymm_mask(ptr %passthru, <8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
2773; CHECK-LABEL: stack_fold_pmuludq_ymm_mask:
2774; CHECK:       # %bb.0:
2775; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2776; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2777; CHECK-NEXT:    #APP
2778; CHECK-NEXT:    nop
2779; CHECK-NEXT:    #NO_APP
2780; CHECK-NEXT:    kmovd %esi, %k1
2781; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
2782; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2783; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
2784; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2785; CHECK-NEXT:    retq
2786  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2787  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2788  %3 = bitcast <8 x i32> %a1 to <4 x i64>
2789  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2790  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2791  %6 = mul <4 x i64> %4, %5
2792  %7 = bitcast i8 %mask to <8 x i1>
2793  %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2794  %9 = load <4 x i64>, ptr %passthru
2795  %10 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> %9
2796  ret <4 x i64> %10
2797}
2798
2799define <4 x i64> @stack_fold_pmuludq_ymm_maskz(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
2800; CHECK-LABEL: stack_fold_pmuludq_ymm_maskz:
2801; CHECK:       # %bb.0:
2802; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2803; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2804; CHECK-NEXT:    #APP
2805; CHECK-NEXT:    nop
2806; CHECK-NEXT:    #NO_APP
2807; CHECK-NEXT:    kmovd %edi, %k1
2808; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2809; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
2810; CHECK-NEXT:    retq
2811  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2812  %2 = bitcast <8 x i32> %a0 to <4 x i64>
2813  %3 = bitcast <8 x i32> %a1 to <4 x i64>
2814  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2815  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
2816  %6 = mul <4 x i64> %4, %5
2817  %7 = bitcast i8 %mask to <8 x i1>
2818  %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2819  %9 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> zeroinitializer
2820  ret <4 x i64> %9
2821}
2822
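; AVX512VPOPCNTDQ tests: @llvm.ctpop.* on i32/i64 vectors should lower to
; vpopcntd/vpopcntq with the reload folded as the source operand.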
2823define <4 x i32> @stack_fold_vpopcntd(<4 x i32> %a0) {
2824; CHECK-LABEL: stack_fold_vpopcntd:
2825; CHECK:       # %bb.0:
2826; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2827; CHECK-NEXT:    #APP
2828; CHECK-NEXT:    nop
2829; CHECK-NEXT:    #NO_APP
2830; CHECK-NEXT:    vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2831; CHECK-NEXT:    retq
2832  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2833  %2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a0)
2834  ret <4 x i32> %2
2835}
2836declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
2837
2838define <8 x i32> @stack_fold_vpopcntd_ymm(<8 x i32> %a0) {
2839; CHECK-LABEL: stack_fold_vpopcntd_ymm:
2840; CHECK:       # %bb.0:
2841; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2842; CHECK-NEXT:    #APP
2843; CHECK-NEXT:    nop
2844; CHECK-NEXT:    #NO_APP
2845; CHECK-NEXT:    vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2846; CHECK-NEXT:    retq
2847  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2848  %2 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a0)
2849  ret <8 x i32> %2
2850}
2851declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
2852
2853define <2 x i64> @stack_fold_vpopcntq(<2 x i64> %a0) {
2854; CHECK-LABEL: stack_fold_vpopcntq:
2855; CHECK:       # %bb.0:
2856; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2857; CHECK-NEXT:    #APP
2858; CHECK-NEXT:    nop
2859; CHECK-NEXT:    #NO_APP
2860; CHECK-NEXT:    vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2861; CHECK-NEXT:    retq
2862  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2863  %2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a0)
2864  ret <2 x i64> %2
2865}
2866declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
2867
2868define <4 x i64> @stack_fold_vpopcntq_ymm(<4 x i64> %a0) {
2869; CHECK-LABEL: stack_fold_vpopcntq_ymm:
2870; CHECK:       # %bb.0:
2871; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2872; CHECK-NEXT:    #APP
2873; CHECK-NEXT:    nop
2874; CHECK-NEXT:    #NO_APP
2875; CHECK-NEXT:    vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2876; CHECK-NEXT:    retq
2877  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2878  %2 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0)
2879  ret <4 x i64> %2
2880}
2881declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
2882
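; VPSADBW sums absolute differences of unsigned bytes in 8-byte groups. It is
; commutative, so the *_commute variants check that the memory operand folds
; with either operand order.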
2883define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
2884; CHECK-LABEL: stack_fold_psadbw:
2885; CHECK:       # %bb.0:
2886; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2887; CHECK-NEXT:    #APP
2888; CHECK-NEXT:    nop
2889; CHECK-NEXT:    #NO_APP
2890; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2891; CHECK-NEXT:    retq
2892  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2893  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
2894  ret <2 x i64> %2
2895}
2896declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
2897
2898define <2 x i64> @stack_fold_psadbw_commute(<16 x i8> %a0, <16 x i8> %a1) {
2899; CHECK-LABEL: stack_fold_psadbw_commute:
2900; CHECK:       # %bb.0:
2901; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2902; CHECK-NEXT:    #APP
2903; CHECK-NEXT:    nop
2904; CHECK-NEXT:    #NO_APP
2905; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2906; CHECK-NEXT:    retq
2907  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2908  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a1, <16 x i8> %a0)
2909  ret <2 x i64> %2
2910}
2911
2912define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
2913; CHECK-LABEL: stack_fold_psadbw_ymm:
2914; CHECK:       # %bb.0:
2915; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2916; CHECK-NEXT:    #APP
2917; CHECK-NEXT:    nop
2918; CHECK-NEXT:    #NO_APP
2919; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2920; CHECK-NEXT:    retq
2921  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2922  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
2923  ret <4 x i64> %2
2924}
2925declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2926
2927define <4 x i64> @stack_fold_psadbw_ymm_commute(<32 x i8> %a0, <32 x i8> %a1) {
2928; CHECK-LABEL: stack_fold_psadbw_ymm_commute:
2929; CHECK:       # %bb.0:
2930; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2931; CHECK-NEXT:    #APP
2932; CHECK-NEXT:    nop
2933; CHECK-NEXT:    #NO_APP
2934; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2935; CHECK-NEXT:    retq
2936  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2937  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a1, <32 x i8> %a0)
2938  ret <4 x i64> %2
2939}
2940
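; VPSHUFB selects bytes within each 128-bit lane using a per-byte control
; vector (a set bit 7 zeroes the result byte), tested here in plain, masked,
; and zero-masked forms.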
2941define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
2942; CHECK-LABEL: stack_fold_pshufb:
2943; CHECK:       # %bb.0:
2944; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2945; CHECK-NEXT:    #APP
2946; CHECK-NEXT:    nop
2947; CHECK-NEXT:    #NO_APP
2948; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2949; CHECK-NEXT:    retq
2950  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2951  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
2952  ret <16 x i8> %2
2953}
2954declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
2955
2956define <16 x i8> @stack_fold_pshufb_mask(ptr %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
2957; CHECK-LABEL: stack_fold_pshufb_mask:
2958; CHECK:       # %bb.0:
2959; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2960; CHECK-NEXT:    #APP
2961; CHECK-NEXT:    nop
2962; CHECK-NEXT:    #NO_APP
2963; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
2964; CHECK-NEXT:    kmovd %esi, %k1
2965; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
2966; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
2967; CHECK-NEXT:    retq
2968  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2969  %2 = load <16 x i8>, ptr %passthru
2970  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
2971  %4 = bitcast i16 %mask to <16 x i1>
2972  %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %2
2973  ret <16 x i8> %5
2974}
2975
2976define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
2977; CHECK-LABEL: stack_fold_pshufb_maskz:
2978; CHECK:       # %bb.0:
2979; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2980; CHECK-NEXT:    #APP
2981; CHECK-NEXT:    nop
2982; CHECK-NEXT:    #NO_APP
2983; CHECK-NEXT:    kmovd %edi, %k1
2984; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
2985; CHECK-NEXT:    retq
2986  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2987  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
2988  %3 = bitcast i16 %mask to <16 x i1>
2989  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
2990  ret <16 x i8> %4
2991}
2992
2993define <32 x i8> @stack_fold_pshufb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
2994; CHECK-LABEL: stack_fold_pshufb_ymm:
2995; CHECK:       # %bb.0:
2996; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2997; CHECK-NEXT:    #APP
2998; CHECK-NEXT:    nop
2999; CHECK-NEXT:    #NO_APP
3000; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3001; CHECK-NEXT:    retq
3002  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3003  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
3004  ret <32 x i8> %2
3005}
3006declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
3007
3008define <32 x i8> @stack_fold_pshufb_ymm_mask(ptr %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
3009; CHECK-LABEL: stack_fold_pshufb_ymm_mask:
3010; CHECK:       # %bb.0:
3011; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3012; CHECK-NEXT:    #APP
3013; CHECK-NEXT:    nop
3014; CHECK-NEXT:    #NO_APP
3015; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
3016; CHECK-NEXT:    kmovd %esi, %k1
3017; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
3018; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
3019; CHECK-NEXT:    retq
3020  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3021  %2 = load <32 x i8>, ptr %passthru
3022  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
3023  %4 = bitcast i32 %mask to <32 x i1>
3024  %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %2
3025  ret <32 x i8> %5
3026}
3027
3028define <32 x i8> @stack_fold_pshufb_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
3029; CHECK-LABEL: stack_fold_pshufb_ymm_maskz:
3030; CHECK:       # %bb.0:
3031; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3032; CHECK-NEXT:    #APP
3033; CHECK-NEXT:    nop
3034; CHECK-NEXT:    #NO_APP
3035; CHECK-NEXT:    kmovd %edi, %k1
3036; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
3037; CHECK-NEXT:    retq
3038  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3039  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
3040  %3 = bitcast i32 %mask to <32 x i1>
3041  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
3042  ret <32 x i8> %4
3043}
3044
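; VPSHUFD immediate shuffles: $27 (0b00011011) selects dwords [3,2,1,0] in each
; 128-bit lane. The trailing add (lowered as vpcmpeqd/vpsubd of -1) presumably
; keeps the shuffle result live so the folded load is not simplified away.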
3045define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
3046; CHECK-LABEL: stack_fold_pshufd:
3047; CHECK:       # %bb.0:
3048; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3049; CHECK-NEXT:    #APP
3050; CHECK-NEXT:    nop
3051; CHECK-NEXT:    #NO_APP
3052; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3053; CHECK-NEXT:    # xmm0 = mem[3,2,1,0]
3054; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
3055; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
3056; CHECK-NEXT:    retq
3057  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3058  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
3059  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
3060  ret <4 x i32> %3
3061}
3062
3063define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) {
3064; CHECK-LABEL: stack_fold_pshufd_mask:
3065; CHECK:       # %bb.0:
3066; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3067; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3068; CHECK-NEXT:    #APP
3069; CHECK-NEXT:    nop
3070; CHECK-NEXT:    #NO_APP
3071; CHECK-NEXT:    kmovd %edi, %k1
3072; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3073; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
3074; CHECK-NEXT:    # xmm0 {%k1} = mem[3,2,1,0]
3075; CHECK-NEXT:    retq
3076  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3077  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
3078  %3 = bitcast i8 %mask to <8 x i1>
3079  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3080  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %passthru
3081  ret <4 x i32> %5
3082}
3083
3084define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) {
3085; CHECK-LABEL: stack_fold_pshufd_maskz:
3086; CHECK:       # %bb.0:
3087; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3088; CHECK-NEXT:    #APP
3089; CHECK-NEXT:    nop
3090; CHECK-NEXT:    #NO_APP
3091; CHECK-NEXT:    kmovd %edi, %k1
3092; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
3093; CHECK-NEXT:    # xmm0 {%k1} {z} = mem[3,2,1,0]
3094; CHECK-NEXT:    retq
3095  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3096  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
3097  %3 = bitcast i8 %mask to <8 x i1>
3098  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3099  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
3100  ret <4 x i32> %5
3101}
3102
3103define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) {
3104; CHECK-LABEL: stack_fold_pshufd_ymm:
3105; CHECK:       # %bb.0:
3106; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3107; CHECK-NEXT:    #APP
3108; CHECK-NEXT:    nop
3109; CHECK-NEXT:    #NO_APP
3110; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3111; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,7,6,5,4]
3112; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
3113; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
3114; CHECK-NEXT:    retq
3115  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3116  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
3117  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3118  ret <8 x i32> %3
3119}
3120
3121define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) {
3122; CHECK-LABEL: stack_fold_pshufd_ymm_mask:
3123; CHECK:       # %bb.0:
3124; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3125; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3126; CHECK-NEXT:    #APP
3127; CHECK-NEXT:    nop
3128; CHECK-NEXT:    #NO_APP
3129; CHECK-NEXT:    kmovd %edi, %k1
3130; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3131; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
3132; CHECK-NEXT:    # ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
3133; CHECK-NEXT:    retq
3134  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3135  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
3136  %3 = bitcast i8 %mask to <8 x i1>
3137  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %passthru
3138  ret <8 x i32> %4
3139}
3140
3141define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) {
3142; CHECK-LABEL: stack_fold_pshufd_ymm_maskz:
3143; CHECK:       # %bb.0:
3144; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3145; CHECK-NEXT:    #APP
3146; CHECK-NEXT:    nop
3147; CHECK-NEXT:    #NO_APP
3148; CHECK-NEXT:    kmovd %edi, %k1
3149; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
3150; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
3151; CHECK-NEXT:    retq
3152  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3153  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
3154  %3 = bitcast i8 %mask to <8 x i1>
3155  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
3156  ret <8 x i32> %4
3157}
3158
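; VPSHUFHW shuffles the upper four words of each 128-bit lane; $11 encodes the
; word order [7,6,4,4] while the lower four words pass through unchanged.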
3159define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
3160; CHECK-LABEL: stack_fold_pshufhw:
3161; CHECK:       # %bb.0:
3162; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3163; CHECK-NEXT:    #APP
3164; CHECK-NEXT:    nop
3165; CHECK-NEXT:    #NO_APP
3166; CHECK-NEXT:    vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3167; CHECK-NEXT:    # xmm0 = mem[0,1,2,3,7,6,4,4]
3168; CHECK-NEXT:    retq
3169  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3170  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
3171  ret <8 x i16> %2
3172}
3173
3174define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
3175; CHECK-LABEL: stack_fold_pshufhw_mask:
3176; CHECK:       # %bb.0:
3177; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3178; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3179; CHECK-NEXT:    #APP
3180; CHECK-NEXT:    nop
3181; CHECK-NEXT:    #NO_APP
3182; CHECK-NEXT:    kmovd %edi, %k1
3183; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3184; CHECK-NEXT:    vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
3185; CHECK-NEXT:    # xmm0 {%k1} = mem[0,1,2,3,7,6,4,4]
3186; CHECK-NEXT:    retq
3187  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3188  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
3189  %3 = bitcast i8 %mask to <8 x i1>
3190  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
3191  ret <8 x i16> %4
3192}
3193
3194define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) {
3195; CHECK-LABEL: stack_fold_pshufhw_maskz:
3196; CHECK:       # %bb.0:
3197; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3198; CHECK-NEXT:    #APP
3199; CHECK-NEXT:    nop
3200; CHECK-NEXT:    #NO_APP
3201; CHECK-NEXT:    kmovd %edi, %k1
3202; CHECK-NEXT:    vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
3203; CHECK-NEXT:    # xmm0 {%k1} {z} = mem[0,1,2,3,7,6,4,4]
3204; CHECK-NEXT:    retq
3205  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3206  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
3207  %3 = bitcast i8 %mask to <8 x i1>
3208  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
3209  ret <8 x i16> %4
3210}
3211
3212define <16 x i16> @stack_fold_pshufhw_ymm(<16 x i16> %a0) {
3213; CHECK-LABEL: stack_fold_pshufhw_ymm:
3214; CHECK:       # %bb.0:
3215; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3216; CHECK-NEXT:    #APP
3217; CHECK-NEXT:    nop
3218; CHECK-NEXT:    #NO_APP
3219; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3220; CHECK-NEXT:    # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
3221; CHECK-NEXT:    retq
3222  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3223  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
3224  ret <16 x i16> %2
3225}
3226
3227define <16 x i16> @stack_fold_pshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
3228; CHECK-LABEL: stack_fold_pshufhw_ymm_mask:
3229; CHECK:       # %bb.0:
3230; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3231; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3232; CHECK-NEXT:    #APP
3233; CHECK-NEXT:    nop
3234; CHECK-NEXT:    #NO_APP
3235; CHECK-NEXT:    kmovd %edi, %k1
3236; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3237; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
3238; CHECK-NEXT:    # ymm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
3239; CHECK-NEXT:    retq
3240  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3241  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
3242  %3 = bitcast i16 %mask to <16 x i1>
3243  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru
3244  ret <16 x i16> %4
3245}
3246
3247define <16 x i16> @stack_fold_pshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
3248; CHECK-LABEL: stack_fold_pshufhw_ymm_maskz:
3249; CHECK:       # %bb.0:
3250; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3251; CHECK-NEXT:    #APP
3252; CHECK-NEXT:    nop
3253; CHECK-NEXT:    #NO_APP
3254; CHECK-NEXT:    kmovd %edi, %k1
3255; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
3256; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
3257; CHECK-NEXT:    retq
3258  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3259  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
3260  %3 = bitcast i16 %mask to <16 x i1>
3261  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
3262  ret <16 x i16> %4
3263}
3264
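; VPSHUFLW is the low-word counterpart: $27 reverses the low four words of each
; 128-bit lane and leaves the upper words unchanged.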
3265define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
3266; CHECK-LABEL: stack_fold_pshuflw:
3267; CHECK:       # %bb.0:
3268; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3269; CHECK-NEXT:    #APP
3270; CHECK-NEXT:    nop
3271; CHECK-NEXT:    #NO_APP
3272; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3273; CHECK-NEXT:    # xmm0 = mem[3,2,1,0,4,5,6,7]
3274; CHECK-NEXT:    retq
3275  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3276  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
3277  ret <8 x i16> %2
3278}
3279
3280define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
3281; CHECK-LABEL: stack_fold_pshuflw_mask:
3282; CHECK:       # %bb.0:
3283; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3284; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3285; CHECK-NEXT:    #APP
3286; CHECK-NEXT:    nop
3287; CHECK-NEXT:    #NO_APP
3288; CHECK-NEXT:    kmovd %edi, %k1
3289; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3290; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
3291; CHECK-NEXT:    # xmm0 {%k1} = mem[3,2,1,0,4,5,6,7]
3292; CHECK-NEXT:    retq
3293  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3294  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
3295  %3 = bitcast i8 %mask to <8 x i1>
3296  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
3297  ret <8 x i16> %4
3298}
3299
3300define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) {
3301; CHECK-LABEL: stack_fold_pshuflw_maskz:
3302; CHECK:       # %bb.0:
3303; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3304; CHECK-NEXT:    #APP
3305; CHECK-NEXT:    nop
3306; CHECK-NEXT:    #NO_APP
3307; CHECK-NEXT:    kmovd %edi, %k1
3308; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
3309; CHECK-NEXT:    # xmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7]
3310; CHECK-NEXT:    retq
3311  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3312  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
3313  %3 = bitcast i8 %mask to <8 x i1>
3314  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
3315  ret <8 x i16> %4
3316}
3317
3318define <16 x i16> @stack_fold_pshuflw_ymm(<16 x i16> %a0) {
3319; CHECK-LABEL: stack_fold_pshuflw_ymm:
3320; CHECK:       # %bb.0:
3321; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3322; CHECK-NEXT:    #APP
3323; CHECK-NEXT:    nop
3324; CHECK-NEXT:    #NO_APP
3325; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3326; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
3327; CHECK-NEXT:    retq
3328  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3329  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
3330  ret <16 x i16> %2
3331}
3332
3333define <16 x i16> @stack_fold_pshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
3334; CHECK-LABEL: stack_fold_pshuflw_ymm_mask:
3335; CHECK:       # %bb.0:
3336; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3337; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3338; CHECK-NEXT:    #APP
3339; CHECK-NEXT:    nop
3340; CHECK-NEXT:    #NO_APP
3341; CHECK-NEXT:    kmovd %edi, %k1
3342; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3343; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
3344; CHECK-NEXT:    # ymm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
3345; CHECK-NEXT:    retq
3346  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3347  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
3348  %3 = bitcast i16 %mask to <16 x i1>
3349  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru
3350  ret <16 x i16> %4
3351}
3352
3353define <16 x i16> @stack_fold_pshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
3354; CHECK-LABEL: stack_fold_pshuflw_ymm_maskz:
3355; CHECK:       # %bb.0:
3356; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3357; CHECK-NEXT:    #APP
3358; CHECK-NEXT:    nop
3359; CHECK-NEXT:    #NO_APP
3360; CHECK-NEXT:    kmovd %edi, %k1
3361; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
3362; CHECK-NEXT:    # ymm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
3363; CHECK-NEXT:    retq
3364  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3365  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
3366  %3 = bitcast i16 %mask to <16 x i1>
3367  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
3368  ret <16 x i16> %4
3369}
3370
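; Uniform shift tests: the shift count lives in the low 64 bits of an xmm
; register, so the 16-byte spill of %a1 should fold as the count operand even
; for the ymm-width shifts.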
3371define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
3372; CHECK-LABEL: stack_fold_pslld:
3373; CHECK:       # %bb.0:
3374; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3375; CHECK-NEXT:    #APP
3376; CHECK-NEXT:    nop
3377; CHECK-NEXT:    #NO_APP
3378; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3379; CHECK-NEXT:    retq
3380  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3381  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
3382  ret <4 x i32> %2
3383}
3384declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
3385
3386define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) {
3387; CHECK-LABEL: stack_fold_pslld_ymm:
3388; CHECK:       # %bb.0:
3389; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3390; CHECK-NEXT:    #APP
3391; CHECK-NEXT:    nop
3392; CHECK-NEXT:    #NO_APP
3393; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3394; CHECK-NEXT:    retq
3395  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3396  %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
3397  ret <8 x i32> %2
3398}
3399declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
3400
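; VPSLLDQ shifts whole bytes left within each 128-bit lane; the IR models this
; as a shufflevector with zeroinitializer supplying the shifted-in zero bytes.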
3401define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) {
3402; CHECK-LABEL: stack_fold_pslldq:
3403; CHECK:       # %bb.0:
3404; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3405; CHECK-NEXT:    #APP
3406; CHECK-NEXT:    nop
3407; CHECK-NEXT:    #NO_APP
3408; CHECK-NEXT:    vpslldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3409; CHECK-NEXT:    # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0,1,2,3]
3410; CHECK-NEXT:    retq
3411  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3412  %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 17, i32 18, i32 19>
3413  ret <16 x i8> %2
3414}
3415
3416define <32 x i8> @stack_fold_pslldq_ymm(<32 x i8> %a) {
3417; CHECK-LABEL: stack_fold_pslldq_ymm:
3418; CHECK:       # %bb.0:
3419; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3420; CHECK-NEXT:    #APP
3421; CHECK-NEXT:    nop
3422; CHECK-NEXT:    #NO_APP
3423; CHECK-NEXT:    vpslldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3424; CHECK-NEXT:    # ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[16]
3425; CHECK-NEXT:    retq
3426  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3427  %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
3428  ret <32 x i8> %2
3429}
3430
3431define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
3432; CHECK-LABEL: stack_fold_psllq:
3433; CHECK:       # %bb.0:
3434; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3435; CHECK-NEXT:    #APP
3436; CHECK-NEXT:    nop
3437; CHECK-NEXT:    #NO_APP
3438; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3439; CHECK-NEXT:    retq
3440  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3441  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
3442  ret <2 x i64> %2
3443}
3444declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
3445
3446define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
3447; CHECK-LABEL: stack_fold_psllq_ymm:
3448; CHECK:       # %bb.0:
3449; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3450; CHECK-NEXT:    #APP
3451; CHECK-NEXT:    nop
3452; CHECK-NEXT:    #NO_APP
3453; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3454; CHECK-NEXT:    retq
3455  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3456  %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
3457  ret <4 x i64> %2
3458}
3459declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
3460
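; Variable (per-element) shift tests: vpsllv* take a full-width shift-count
; vector, so the folded reload matches the destination width (16 or 32 bytes).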
3461define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
3462; CHECK-LABEL: stack_fold_psllvd:
3463; CHECK:       # %bb.0:
3464; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3465; CHECK-NEXT:    #APP
3466; CHECK-NEXT:    nop
3467; CHECK-NEXT:    #NO_APP
3468; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3469; CHECK-NEXT:    retq
3470  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3471  %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
3472  ret <4 x i32> %2
3473}
3474declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
3475
3476define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
3477; CHECK-LABEL: stack_fold_psllvd_ymm:
3478; CHECK:       # %bb.0:
3479; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3480; CHECK-NEXT:    #APP
3481; CHECK-NEXT:    nop
3482; CHECK-NEXT:    #NO_APP
3483; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3484; CHECK-NEXT:    retq
3485  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3486  %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
3487  ret <8 x i32> %2
3488}
3489declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
3490
3491define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
3492; CHECK-LABEL: stack_fold_psllvq:
3493; CHECK:       # %bb.0:
3494; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3495; CHECK-NEXT:    #APP
3496; CHECK-NEXT:    nop
3497; CHECK-NEXT:    #NO_APP
3498; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3499; CHECK-NEXT:    retq
3500  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3501  %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
3502  ret <2 x i64> %2
3503}
3504declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
3505
3506define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
3507; CHECK-LABEL: stack_fold_psllvq_ymm:
3508; CHECK:       # %bb.0:
3509; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3510; CHECK-NEXT:    #APP
3511; CHECK-NEXT:    nop
3512; CHECK-NEXT:    #NO_APP
3513; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3514; CHECK-NEXT:    retq
3515  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3516  %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
3517  ret <4 x i64> %2
3518}
3519declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
3520
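; Per-element word shifts (vpsllvw) require AVX512BW and use the
; @llvm.x86.avx512.psllv.w.* intrinsics.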
3521define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) {
3522; CHECK-LABEL: stack_fold_psllvw:
3523; CHECK:       # %bb.0:
3524; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3525; CHECK-NEXT:    #APP
3526; CHECK-NEXT:    nop
3527; CHECK-NEXT:    #NO_APP
3528; CHECK-NEXT:    vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3529; CHECK-NEXT:    retq
3530  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3531  %2 = call <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16> %a0, <8 x i16> %a1)
3532  ret <8 x i16> %2
3533}
3534declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
3535
3536define <16 x i16> @stack_fold_psllvw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
3537; CHECK-LABEL: stack_fold_psllvw_ymm:
3538; CHECK:       # %bb.0:
3539; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3540; CHECK-NEXT:    #APP
3541; CHECK-NEXT:    nop
3542; CHECK-NEXT:    #NO_APP
3543; CHECK-NEXT:    vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3544; CHECK-NEXT:    retq
3545  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3546  %2 = call <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16> %a0, <16 x i16> %a1)
3547  ret <16 x i16> %2
3548}
3549declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
3550
3551define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
3552; CHECK-LABEL: stack_fold_psllw:
3553; CHECK:       # %bb.0:
3554; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3555; CHECK-NEXT:    #APP
3556; CHECK-NEXT:    nop
3557; CHECK-NEXT:    #NO_APP
3558; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3559; CHECK-NEXT:    retq
3560  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3561  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
3562  ret <8 x i16> %2
3563}
3564declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
3565
3566define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
3567; CHECK-LABEL: stack_fold_psllw_ymm:
3568; CHECK:       # %bb.0:
3569; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3570; CHECK-NEXT:    #APP
3571; CHECK-NEXT:    nop
3572; CHECK-NEXT:    #NO_APP
3573; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3574; CHECK-NEXT:    retq
3575  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3576  %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
3577  ret <16 x i16> %2
3578}
3579declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
3580
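; Arithmetic right-shift tests, again with the count taken from the low
; 64 bits of a spilled xmm register.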
3581define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
3582; CHECK-LABEL: stack_fold_psrad:
3583; CHECK:       # %bb.0:
3584; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3585; CHECK-NEXT:    #APP
3586; CHECK-NEXT:    nop
3587; CHECK-NEXT:    #NO_APP
3588; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3589; CHECK-NEXT:    retq
3590  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3591  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
3592  ret <4 x i32> %2
3593}
3594declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
3595
3596define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) {
3597; CHECK-LABEL: stack_fold_psrad_ymm:
3598; CHECK:       # %bb.0:
3599; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3600; CHECK-NEXT:    #APP
3601; CHECK-NEXT:    nop
3602; CHECK-NEXT:    #NO_APP
3603; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3604; CHECK-NEXT:    retq
3605  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3606  %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
3607  ret <8 x i32> %2
3608}
3609declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
3610
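; VPSRAQ (64-bit arithmetic right shift) is new with AVX-512; there is no
; SSE/AVX2 equivalent, hence the @llvm.x86.avx512.psra.q.* intrinsics.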
3611define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) {
3612; CHECK-LABEL: stack_fold_psraq:
3613; CHECK:       # %bb.0:
3614; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3615; CHECK-NEXT:    #APP
3616; CHECK-NEXT:    nop
3617; CHECK-NEXT:    #NO_APP
3618; CHECK-NEXT:    vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3619; CHECK-NEXT:    retq
3620  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3621  %2 = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1)
3622  ret <2 x i64> %2
3623}
3624declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone
3625
3626define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
3627; CHECK-LABEL: stack_fold_psraq_ymm:
3628; CHECK:       # %bb.0:
3629; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3630; CHECK-NEXT:    #APP
3631; CHECK-NEXT:    nop
3632; CHECK-NEXT:    #NO_APP
3633; CHECK-NEXT:    vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3634; CHECK-NEXT:    retq
3635  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3636  %2 = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1)
3637  ret <4 x i64> %2
3638}
3639declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone
3640
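; Per-element arithmetic right shifts: vpsravd comes from AVX2, while the
; q and w forms require AVX512VL and AVX512BW respectively.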
3641define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
3642; CHECK-LABEL: stack_fold_psravd:
3643; CHECK:       # %bb.0:
3644; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3645; CHECK-NEXT:    #APP
3646; CHECK-NEXT:    nop
3647; CHECK-NEXT:    #NO_APP
3648; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3649; CHECK-NEXT:    retq
3650  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3651  %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
3652  ret <4 x i32> %2
3653}
3654declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
3655
3656define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
3657; CHECK-LABEL: stack_fold_psravd_ymm:
3658; CHECK:       # %bb.0:
3659; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3660; CHECK-NEXT:    #APP
3661; CHECK-NEXT:    nop
3662; CHECK-NEXT:    #NO_APP
3663; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3664; CHECK-NEXT:    retq
3665  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3666  %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
3667  ret <8 x i32> %2
3668}
3669declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
3670
3671define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) {
3672; CHECK-LABEL: stack_fold_psravq:
3673; CHECK:       # %bb.0:
3674; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3675; CHECK-NEXT:    #APP
3676; CHECK-NEXT:    nop
3677; CHECK-NEXT:    #NO_APP
3678; CHECK-NEXT:    vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3679; CHECK-NEXT:    retq
3680  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3681  %2 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
3682  ret <2 x i64> %2
3683}
3684declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone
3685
3686define <4 x i64> @stack_fold_psravq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
3687; CHECK-LABEL: stack_fold_psravq_ymm:
3688; CHECK:       # %bb.0:
3689; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3690; CHECK-NEXT:    #APP
3691; CHECK-NEXT:    nop
3692; CHECK-NEXT:    #NO_APP
3693; CHECK-NEXT:    vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3694; CHECK-NEXT:    retq
3695  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3696  %2 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
3697  ret <4 x i64> %2
3698}
3699declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
3700
3701define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) {
3702; CHECK-LABEL: stack_fold_psravw:
3703; CHECK:       # %bb.0:
3704; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3705; CHECK-NEXT:    #APP
3706; CHECK-NEXT:    nop
3707; CHECK-NEXT:    #NO_APP
3708; CHECK-NEXT:    vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3709; CHECK-NEXT:    retq
3710  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3711  %2 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %a0, <8 x i16> %a1)
3712  ret <8 x i16> %2
3713}
3714declare <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16>, <8 x i16>) nounwind readnone
3715
3716define <16 x i16> @stack_fold_psravw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
3717; CHECK-LABEL: stack_fold_psravw_ymm:
3718; CHECK:       # %bb.0:
3719; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3720; CHECK-NEXT:    #APP
3721; CHECK-NEXT:    nop
3722; CHECK-NEXT:    #NO_APP
3723; CHECK-NEXT:    vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3724; CHECK-NEXT:    retq
3725  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3726  %2 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %a0, <16 x i16> %a1)
3727  ret <16 x i16> %2
3728}
3729declare <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16>, <16 x i16>) nounwind readnone
3730
3731define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
3732; CHECK-LABEL: stack_fold_psraw:
3733; CHECK:       # %bb.0:
3734; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3735; CHECK-NEXT:    #APP
3736; CHECK-NEXT:    nop
3737; CHECK-NEXT:    #NO_APP
3738; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3739; CHECK-NEXT:    retq
3740  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3741  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
3742  ret <8 x i16> %2
3743}
3744declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
3745
3746define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
3747; CHECK-LABEL: stack_fold_psraw_ymm:
3748; CHECK:       # %bb.0:
3749; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3750; CHECK-NEXT:    #APP
3751; CHECK-NEXT:    nop
3752; CHECK-NEXT:    #NO_APP
3753; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3754; CHECK-NEXT:    retq
3755  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3756  %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
3757  ret <16 x i16> %2
3758}
3759declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
3760
3761define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
3762; CHECK-LABEL: stack_fold_psrld:
3763; CHECK:       # %bb.0:
3764; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3765; CHECK-NEXT:    #APP
3766; CHECK-NEXT:    nop
3767; CHECK-NEXT:    #NO_APP
3768; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3769; CHECK-NEXT:    retq
3770  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3771  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
3772  ret <4 x i32> %2
3773}
3774declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
3775
3776define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) {
3777; CHECK-LABEL: stack_fold_psrld_ymm:
3778; CHECK:       # %bb.0:
3779; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3780; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3781; CHECK-NEXT:    #APP
3782; CHECK-NEXT:    nop
3783; CHECK-NEXT:    #NO_APP
3784; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3785; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3786; CHECK-NEXT:    retq
3787  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3788  %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
3789  ret <8 x i32> %2
3790}
3791declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
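; The psrld_ymm test above also clobbers xmm1 in its asm block, so both the
; 16-byte shift count and the 32-byte ymm0 input get spilled; the input comes
; back through an explicit reload while the count folds into vpsrld itself.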
3792
3793define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) {
3794; CHECK-LABEL: stack_fold_psrldq:
3795; CHECK:       # %bb.0:
3796; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3797; CHECK-NEXT:    #APP
3798; CHECK-NEXT:    nop
3799; CHECK-NEXT:    #NO_APP
3800; CHECK-NEXT:    vpsrldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3801; CHECK-NEXT:    # xmm0 = mem[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3802; CHECK-NEXT:    retq
3803  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3804  %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 29, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
3805  ret <16 x i8> %2
3806}
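; The immediate byte shift needs no intrinsic: vpsrldq $12 is modelled as a
; shufflevector whose first operand is zeroinitializer, so indices 0..15 read
; zero and indices 16..31 read %a. Selecting <28,29,30,31> first pulls bytes
; 12..15 of %a into the low positions, i.e. a right shift by 12 bytes.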
3807
3808define <32 x i8> @stack_fold_psrldq_ymm(<32 x i8> %a) {
3809; CHECK-LABEL: stack_fold_psrldq_ymm:
3810; CHECK:       # %bb.0:
3811; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3812; CHECK-NEXT:    #APP
3813; CHECK-NEXT:    nop
3814; CHECK-NEXT:    #NO_APP
3815; CHECK-NEXT:    vpsrldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3816; CHECK-NEXT:    # ymm0 = mem[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3817; CHECK-NEXT:    retq
3818  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3819  %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
3820  ret <32 x i8> %2
3821}
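; For the ymm width vpsrldq shifts each 128-bit lane independently, which is
; why the mask picks element 47 (byte 15, the top of %a's low lane) and
; element 63 (byte 31, the top of the high lane) rather than one cross-lane
; byte.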
3822
3823define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
3824; CHECK-LABEL: stack_fold_psrlq:
3825; CHECK:       # %bb.0:
3826; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3827; CHECK-NEXT:    #APP
3828; CHECK-NEXT:    nop
3829; CHECK-NEXT:    #NO_APP
3830; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3831; CHECK-NEXT:    retq
3832  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3833  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
3834  ret <2 x i64> %2
3835}
3836declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
3837
3838define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
3839; CHECK-LABEL: stack_fold_psrlq_ymm:
3840; CHECK:       # %bb.0:
3841; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3842; CHECK-NEXT:    #APP
3843; CHECK-NEXT:    nop
3844; CHECK-NEXT:    #NO_APP
3845; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3846; CHECK-NEXT:    retq
3847  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3848  %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
3849  ret <4 x i64> %2
3850}
3851declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
3852
3853define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
3854; CHECK-LABEL: stack_fold_psrlvd:
3855; CHECK:       # %bb.0:
3856; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3857; CHECK-NEXT:    #APP
3858; CHECK-NEXT:    nop
3859; CHECK-NEXT:    #NO_APP
3860; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3861; CHECK-NEXT:    retq
3862  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3863  %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
3864  ret <4 x i32> %2
3865}
3866declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
3867
3868define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
3869; CHECK-LABEL: stack_fold_psrlvd_ymm:
3870; CHECK:       # %bb.0:
3871; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3872; CHECK-NEXT:    #APP
3873; CHECK-NEXT:    nop
3874; CHECK-NEXT:    #NO_APP
3875; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3876; CHECK-NEXT:    retq
3877  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3878  %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
3879  ret <8 x i32> %2
3880}
3881declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
3882
3883define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
3884; CHECK-LABEL: stack_fold_psrlvq:
3885; CHECK:       # %bb.0:
3886; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3887; CHECK-NEXT:    #APP
3888; CHECK-NEXT:    nop
3889; CHECK-NEXT:    #NO_APP
3890; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3891; CHECK-NEXT:    retq
3892  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3893  %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
3894  ret <2 x i64> %2
3895}
3896declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
3897
3898define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
3899; CHECK-LABEL: stack_fold_psrlvq_ymm:
3900; CHECK:       # %bb.0:
3901; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3902; CHECK-NEXT:    #APP
3903; CHECK-NEXT:    nop
3904; CHECK-NEXT:    #NO_APP
3905; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3906; CHECK-NEXT:    retq
3907  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3908  %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
3909  ret <4 x i64> %2
3910}
3911declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
3912
3913define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) {
3914; CHECK-LABEL: stack_fold_psrlvw:
3915; CHECK:       # %bb.0:
3916; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3917; CHECK-NEXT:    #APP
3918; CHECK-NEXT:    nop
3919; CHECK-NEXT:    #NO_APP
3920; CHECK-NEXT:    vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3921; CHECK-NEXT:    retq
3922  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3923  %2 = call <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16> %a0, <8 x i16> %a1)
3924  ret <8 x i16> %2
3925}
3926declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
3927
3928define <16 x i16> @stack_fold_psrlvw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
3929; CHECK-LABEL: stack_fold_psrlvw_ymm:
3930; CHECK:       # %bb.0:
3931; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3932; CHECK-NEXT:    #APP
3933; CHECK-NEXT:    nop
3934; CHECK-NEXT:    #NO_APP
3935; CHECK-NEXT:    vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3936; CHECK-NEXT:    retq
3937  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3938  %2 = call <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16> %a0, <16 x i16> %a1)
3939  ret <16 x i16> %2
3940}
3941declare <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
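; As with the arithmetic forms, the variable logical shifts split by ISA:
; vpsrlvd/vpsrlvq date from AVX2 (llvm.x86.avx2.psrlv.*), while vpsrlvw only
; exists with AVX512BW+VL and goes through llvm.x86.avx512.psrlv.w.*.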
3942
3943define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
3944; CHECK-LABEL: stack_fold_psrlw:
3945; CHECK:       # %bb.0:
3946; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3947; CHECK-NEXT:    #APP
3948; CHECK-NEXT:    nop
3949; CHECK-NEXT:    #NO_APP
3950; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3951; CHECK-NEXT:    retq
3952  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3953  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
3954  ret <8 x i16> %2
3955}
3956declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
3957
3958define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
3959; CHECK-LABEL: stack_fold_psrlw_ymm:
3960; CHECK:       # %bb.0:
3961; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3962; CHECK-NEXT:    #APP
3963; CHECK-NEXT:    nop
3964; CHECK-NEXT:    #NO_APP
3965; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
3966; CHECK-NEXT:    retq
3967  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3968  %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
3969  ret <16 x i16> %2
3970}
3971declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
3972
3973define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
3974; CHECK-LABEL: stack_fold_psubb:
3975; CHECK:       # %bb.0:
3976; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3977; CHECK-NEXT:    #APP
3978; CHECK-NEXT:    nop
3979; CHECK-NEXT:    #NO_APP
3980; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3981; CHECK-NEXT:    retq
3982  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3983  %2 = sub <16 x i8> %a0, %a1
3984  ret <16 x i8> %2
3985}
3986
3987define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
3988; CHECK-LABEL: stack_fold_psubb_ymm:
3989; CHECK:       # %bb.0:
3990; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3991; CHECK-NEXT:    #APP
3992; CHECK-NEXT:    nop
3993; CHECK-NEXT:    #NO_APP
3994; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3995; CHECK-NEXT:    retq
3996  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3997  %2 = sub <32 x i8> %a0, %a1
3998  ret <32 x i8> %2
3999}
4000
4001define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
4002; CHECK-LABEL: stack_fold_psubd:
4003; CHECK:       # %bb.0:
4004; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4005; CHECK-NEXT:    #APP
4006; CHECK-NEXT:    nop
4007; CHECK-NEXT:    #NO_APP
4008; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4009; CHECK-NEXT:    retq
4010  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4011  %2 = sub <4 x i32> %a0, %a1
4012  ret <4 x i32> %2
4013}
4014
4015define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
4016; CHECK-LABEL: stack_fold_psubd_ymm:
4017; CHECK:       # %bb.0:
4018; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4019; CHECK-NEXT:    #APP
4020; CHECK-NEXT:    nop
4021; CHECK-NEXT:    #NO_APP
4022; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4023; CHECK-NEXT:    retq
4024  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4025  %2 = sub <8 x i32> %a0, %a1
4026  ret <8 x i32> %2
4027}
4028
4029define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
4030; CHECK-LABEL: stack_fold_psubq:
4031; CHECK:       # %bb.0:
4032; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4033; CHECK-NEXT:    #APP
4034; CHECK-NEXT:    nop
4035; CHECK-NEXT:    #NO_APP
4036; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4037; CHECK-NEXT:    retq
4038  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4039  %2 = sub <2 x i64> %a0, %a1
4040  ret <2 x i64> %2
4041}
4042
4043define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
4044; CHECK-LABEL: stack_fold_psubq_ymm:
4045; CHECK:       # %bb.0:
4046; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4047; CHECK-NEXT:    #APP
4048; CHECK-NEXT:    nop
4049; CHECK-NEXT:    #NO_APP
4050; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4051; CHECK-NEXT:    retq
4052  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4053  %2 = sub <4 x i64> %a0, %a1
4054  ret <4 x i64> %2
4055}
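; The plain psub tests need no intrinsics at all: a vector IR sub selects
; directly to vpsub*. A minimal sketch of the pattern (names hypothetical):
;   %d = sub <4 x i32> %x, %y   ; -> vpsubd; a spilled %y folds as the memory
;                               ;    operand since sub does not commute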
4056
4057define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
4058; CHECK-LABEL: stack_fold_psubsb:
4059; CHECK:       # %bb.0:
4060; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4061; CHECK-NEXT:    #APP
4062; CHECK-NEXT:    nop
4063; CHECK-NEXT:    #NO_APP
4064; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4065; CHECK-NEXT:    retq
4066  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4067  %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
4068  ret <16 x i8> %2
4069}
4070declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
4071
4072define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
4073; CHECK-LABEL: stack_fold_psubsb_ymm:
4074; CHECK:       # %bb.0:
4075; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4076; CHECK-NEXT:    #APP
4077; CHECK-NEXT:    nop
4078; CHECK-NEXT:    #NO_APP
4079; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4080; CHECK-NEXT:    retq
4081  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4082  %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
4083  ret <32 x i8> %2
4084}
4085declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
4086
4087define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
4088; CHECK-LABEL: stack_fold_psubsw:
4089; CHECK:       # %bb.0:
4090; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4091; CHECK-NEXT:    #APP
4092; CHECK-NEXT:    nop
4093; CHECK-NEXT:    #NO_APP
4094; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4095; CHECK-NEXT:    retq
4096  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4097  %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
4098  ret <8 x i16> %2
4099}
4100declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
4101
4102define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
4103; CHECK-LABEL: stack_fold_psubsw_ymm:
4104; CHECK:       # %bb.0:
4105; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4106; CHECK-NEXT:    #APP
4107; CHECK-NEXT:    nop
4108; CHECK-NEXT:    #NO_APP
4109; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4110; CHECK-NEXT:    retq
4111  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4112  %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
4113  ret <16 x i16> %2
4114}
4115declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
4116
4117define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
4118; CHECK-LABEL: stack_fold_psubusb:
4119; CHECK:       # %bb.0:
4120; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4121; CHECK-NEXT:    #APP
4122; CHECK-NEXT:    nop
4123; CHECK-NEXT:    #NO_APP
4124; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4125; CHECK-NEXT:    retq
4126  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4127  %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
4128  ret <16 x i8> %2
4129}
4130declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
4131
4132define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
4133; CHECK-LABEL: stack_fold_psubusb_ymm:
4134; CHECK:       # %bb.0:
4135; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4136; CHECK-NEXT:    #APP
4137; CHECK-NEXT:    nop
4138; CHECK-NEXT:    #NO_APP
4139; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4140; CHECK-NEXT:    retq
4141  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4142  %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
4143  ret <32 x i8> %2
4144}
4145declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
4146
4147define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
4148; CHECK-LABEL: stack_fold_psubusw:
4149; CHECK:       # %bb.0:
4150; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4151; CHECK-NEXT:    #APP
4152; CHECK-NEXT:    nop
4153; CHECK-NEXT:    #NO_APP
4154; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4155; CHECK-NEXT:    retq
4156  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4157  %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
4158  ret <8 x i16> %2
4159}
4160declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
4161
4162define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
4163; CHECK-LABEL: stack_fold_psubusw_ymm:
4164; CHECK:       # %bb.0:
4165; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4166; CHECK-NEXT:    #APP
4167; CHECK-NEXT:    nop
4168; CHECK-NEXT:    #NO_APP
4169; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4170; CHECK-NEXT:    retq
4171  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4172  %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
4173  ret <16 x i16> %2
4174}
4175declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
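; The saturating subtracts use the target-independent llvm.ssub.sat and
; llvm.usub.sat intrinsics rather than x86-specific ones; on this target they
; select to vpsubs*/vpsubus* and fold their second operand the same way.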
4176
4177define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
4178; CHECK-LABEL: stack_fold_psubw:
4179; CHECK:       # %bb.0:
4180; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4181; CHECK-NEXT:    #APP
4182; CHECK-NEXT:    nop
4183; CHECK-NEXT:    #NO_APP
4184; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4185; CHECK-NEXT:    retq
4186  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4187  %2 = sub <8 x i16> %a0, %a1
4188  ret <8 x i16> %2
4189}
4190
4191define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
4192; CHECK-LABEL: stack_fold_psubw_ymm:
4193; CHECK:       # %bb.0:
4194; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4195; CHECK-NEXT:    #APP
4196; CHECK-NEXT:    nop
4197; CHECK-NEXT:    #NO_APP
4198; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4199; CHECK-NEXT:    retq
4200  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4201  %2 = sub <16 x i16> %a0, %a1
4202  ret <16 x i16> %2
4203}
4204
4205define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
4206; CHECK-LABEL: stack_fold_punpckhbw:
4207; CHECK:       # %bb.0:
4208; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4209; CHECK-NEXT:    #APP
4210; CHECK-NEXT:    nop
4211; CHECK-NEXT:    #NO_APP
4212; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4213; CHECK-NEXT:    # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
4214; CHECK-NEXT:    retq
4215  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4216  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
4217  ret <16 x i8> %2
4218}
4219
4220define <16 x i8> @stack_fold_punpckhbw_mask(ptr %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
4221; CHECK-LABEL: stack_fold_punpckhbw_mask:
4222; CHECK:       # %bb.0:
4223; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4224; CHECK-NEXT:    #APP
4225; CHECK-NEXT:    nop
4226; CHECK-NEXT:    #NO_APP
4227; CHECK-NEXT:    kmovd %esi, %k1
4228; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
4229; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
4230; CHECK-NEXT:    # xmm2 {%k1} = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
4231; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
4232; CHECK-NEXT:    retq
4233  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4234  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
4235  %3 = bitcast i16 %mask to <16 x i1>
4236  ; load needed to keep the operation from being scheduled above the asm block
4237  %4 = load <16 x i8>, ptr %passthru
4238  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
4239  ret <16 x i8> %5
4240}
4241
4242define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
4243; CHECK-LABEL: stack_fold_punpckhbw_maskz:
4244; CHECK:       # %bb.0:
4245; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4246; CHECK-NEXT:    #APP
4247; CHECK-NEXT:    nop
4248; CHECK-NEXT:    #NO_APP
4249; CHECK-NEXT:    kmovd %edi, %k1
4250; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
4251; CHECK-NEXT:    # xmm0 {%k1} {z} = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
4252; CHECK-NEXT:    retq
4253  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4254  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
4255  %3 = bitcast i16 %mask to <16 x i1>
4256  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
4257  ret <16 x i8> %4
4258}
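; The mask and maskz variants all follow one idiom: bitcast the scalar mask
; to a vector of i1, then select between the shuffle result and either a
; loaded passthru (merge masking, {%k1}) or zeroinitializer (zero masking,
; {%k1} {z}). A minimal sketch, with hypothetical names:
;   %m = bitcast i16 %mask to <16 x i1>
;   %r = select <16 x i1> %m, <16 x i8> %shuf, <16 x i8> zeroinitializer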
4259
4260define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
4261; CHECK-LABEL: stack_fold_punpckhbw_ymm:
4262; CHECK:       # %bb.0:
4263; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4264; CHECK-NEXT:    #APP
4265; CHECK-NEXT:    nop
4266; CHECK-NEXT:    #NO_APP
4267; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4268; CHECK-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
4269; CHECK-NEXT:    retq
4270  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4271  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
4272  ret <32 x i8> %2
4273}
4274
4275define <32 x i8> @stack_fold_punpckhbw_mask_ymm(ptr %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
4276; CHECK-LABEL: stack_fold_punpckhbw_mask_ymm:
4277; CHECK:       # %bb.0:
4278; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4279; CHECK-NEXT:    #APP
4280; CHECK-NEXT:    nop
4281; CHECK-NEXT:    #NO_APP
4282; CHECK-NEXT:    kmovd %esi, %k1
4283; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
4284; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
4285; CHECK-NEXT:    # ymm2 {%k1} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
4286; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
4287; CHECK-NEXT:    retq
4288  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4289  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
4290  %3 = bitcast i32 %mask to <32 x i1>
4291  ; load needed to keep the operation from being scheduled above the asm block
4292  %4 = load <32 x i8>, ptr %passthru
4293  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
4294  ret <32 x i8> %5
4295}
4296
4297define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
4298; CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm:
4299; CHECK:       # %bb.0:
4300; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4301; CHECK-NEXT:    #APP
4302; CHECK-NEXT:    nop
4303; CHECK-NEXT:    #NO_APP
4304; CHECK-NEXT:    kmovd %edi, %k1
4305; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
4306; CHECK-NEXT:    # ymm0 {%k1} {z} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
4307; CHECK-NEXT:    retq
4308  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4309  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
4310  %3 = bitcast i32 %mask to <32 x i1>
4311  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
4312  ret <32 x i8> %4
4313}
4314
4315define <4 x i64> @stack_fold_shufi64x2_maskz(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
4316; CHECK-LABEL: stack_fold_shufi64x2_maskz:
4317; CHECK:       # %bb.0:
4318; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4319; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4320; CHECK-NEXT:    #APP
4321; CHECK-NEXT:    nop
4322; CHECK-NEXT:    #NO_APP
4323; CHECK-NEXT:    kmovd %edi, %k1
4324; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4325; CHECK-NEXT:    vshufi64x2 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
4326; CHECK-NEXT:    # ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
4327; CHECK-NEXT:    retq
4328  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4329  %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
4330  %3 = bitcast i8 %mask to <8 x i1>
4331  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4332  %5 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
4333  ret <4 x i64> %5
4334}
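; vshufi64x2 on ymm has only four 64-bit elements, so the i8 mask is bitcast
; to <8 x i1> and then narrowed to <4 x i1> with a shufflevector before
; feeding the select.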
4335
4336define <8 x i32> @stack_fold_shufi32x4_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
4337; CHECK-LABEL: stack_fold_shufi32x4_maskz:
4338; CHECK:       # %bb.0:
4339; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4340; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4341; CHECK-NEXT:    #APP
4342; CHECK-NEXT:    nop
4343; CHECK-NEXT:    #NO_APP
4344; CHECK-NEXT:    kmovd %edi, %k1
4345; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4346; CHECK-NEXT:    vshufi32x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
4347; CHECK-NEXT:    # ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
4348; CHECK-NEXT:    retq
4349  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4350  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
4351  %3 = bitcast i8 %mask to <8 x i1>
4352  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
4353  ret <8 x i32> %4
4354}
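; Both vshufi32x4 and vshufi64x2 move whole 128-bit blocks: with immediate 1
; the result is the upper half of the first source followed by the lower half
; of the reloaded second source, matching the ymm0[...],mem[...] shuffle
; comments above.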
4355
4356declare <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32>)
4357declare <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32>)
4358declare <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64>)
4359declare <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64>)
4360declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
4361declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
4362declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
4363declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)
4364