xref: /llvm-project/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll (revision aa02b76b1a4c2df1a701fbd8a8378d1cd946e70a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5target triple = "x86_64-unknown-unknown"
6
7; Stack reload folding tests.
8;
9; By including a nop call with sideeffects we can force a partial register spill of the
10; relevant registers and check that the reload is correctly folded into the instruction.
11
12define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
13; CHECK-LABEL: stack_fold_valignd:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    pushq %rax
16; CHECK-NEXT:    .cfi_def_cfa_offset 16
17; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19; CHECK-NEXT:    #APP
20; CHECK-NEXT:    nop
21; CHECK-NEXT:    #NO_APP
22; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
24; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
25; CHECK-NEXT:    popq %rax
26; CHECK-NEXT:    .cfi_def_cfa_offset 8
27; CHECK-NEXT:    retq
28  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
29  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
30  ret <16 x i32> %2
31}
32
33define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, ptr %passthru, i16 %mask) {
34; CHECK-LABEL: stack_fold_valignd_mask:
35; CHECK:       # %bb.0:
36; CHECK-NEXT:    pushq %rax
37; CHECK-NEXT:    .cfi_def_cfa_offset 16
38; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
39; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
40; CHECK-NEXT:    #APP
41; CHECK-NEXT:    nop
42; CHECK-NEXT:    #NO_APP
43; CHECK-NEXT:    kmovd %esi, %k1
44; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
45; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
46; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
47; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
48; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
49; CHECK-NEXT:    popq %rax
50; CHECK-NEXT:    .cfi_def_cfa_offset 8
51; CHECK-NEXT:    retq
52  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
53  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
54  %3 = bitcast i16 %mask to <16 x i1>
55  %4 = load <16 x i32>, ptr %passthru
56  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
57  ret <16 x i32> %5
58}
59
60define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
61; CHECK-LABEL: stack_fold_valignd_maskz:
62; CHECK:       # %bb.0:
63; CHECK-NEXT:    pushq %rax
64; CHECK-NEXT:    .cfi_def_cfa_offset 16
65; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
66; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
67; CHECK-NEXT:    #APP
68; CHECK-NEXT:    nop
69; CHECK-NEXT:    #NO_APP
70; CHECK-NEXT:    kmovd %edi, %k1
71; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
72; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
73; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
74; CHECK-NEXT:    popq %rax
75; CHECK-NEXT:    .cfi_def_cfa_offset 8
76; CHECK-NEXT:    retq
77  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
78  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
79  %3 = bitcast i16 %mask to <16 x i1>
80  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
81  ret <16 x i32> %4
82}
83
84define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
85; CHECK-LABEL: stack_fold_valignq:
86; CHECK:       # %bb.0:
87; CHECK-NEXT:    pushq %rax
88; CHECK-NEXT:    .cfi_def_cfa_offset 16
89; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
90; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
91; CHECK-NEXT:    #APP
92; CHECK-NEXT:    nop
93; CHECK-NEXT:    #NO_APP
94; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
95; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
96; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0]
97; CHECK-NEXT:    popq %rax
98; CHECK-NEXT:    .cfi_def_cfa_offset 8
99; CHECK-NEXT:    retq
100  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
101  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
102  ret <8 x i64> %2
103}
104
105define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, ptr %passthru, i8 %mask) {
106; CHECK-LABEL: stack_fold_valignq_mask:
107; CHECK:       # %bb.0:
108; CHECK-NEXT:    pushq %rax
109; CHECK-NEXT:    .cfi_def_cfa_offset 16
110; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
111; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
112; CHECK-NEXT:    #APP
113; CHECK-NEXT:    nop
114; CHECK-NEXT:    #NO_APP
115; CHECK-NEXT:    kmovd %esi, %k1
116; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
117; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
118; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
119; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0]
120; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
121; CHECK-NEXT:    popq %rax
122; CHECK-NEXT:    .cfi_def_cfa_offset 8
123; CHECK-NEXT:    retq
124  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
125  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
126  %3 = bitcast i8 %mask to <8 x i1>
127  %4 = load <8 x i64>, ptr %passthru
128  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
129  ret <8 x i64> %5
130}
131
132define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
133; CHECK-LABEL: stack_fold_valignq_maskz:
134; CHECK:       # %bb.0:
135; CHECK-NEXT:    pushq %rax
136; CHECK-NEXT:    .cfi_def_cfa_offset 16
137; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
138; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
139; CHECK-NEXT:    #APP
140; CHECK-NEXT:    nop
141; CHECK-NEXT:    #NO_APP
142; CHECK-NEXT:    kmovd %edi, %k1
143; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
144; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
145; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0]
146; CHECK-NEXT:    popq %rax
147; CHECK-NEXT:    .cfi_def_cfa_offset 8
148; CHECK-NEXT:    retq
149  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
150  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
151  %3 = bitcast i8 %mask to <8 x i1>
152  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
153  ret <8 x i64> %4
154}
155
156define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
157; CHECK-LABEL: stack_fold_pavgb:
158; CHECK:       # %bb.0:
159; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
160; CHECK-NEXT:    #APP
161; CHECK-NEXT:    nop
162; CHECK-NEXT:    #NO_APP
163; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
164; CHECK-NEXT:    retq
165  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
166  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
167  ret <64 x i8> %2
168}
169declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)
170
171define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
172; CHECK-LABEL: stack_fold_pavgb_commuted:
173; CHECK:       # %bb.0:
174; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
175; CHECK-NEXT:    #APP
176; CHECK-NEXT:    nop
177; CHECK-NEXT:    #NO_APP
178; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
179; CHECK-NEXT:    retq
180  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
181  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
182  ret <64 x i8> %2
183}
184
185define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
186; CHECK-LABEL: stack_fold_pavgb_mask:
187; CHECK:       # %bb.0:
188; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
189; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
190; CHECK-NEXT:    #APP
191; CHECK-NEXT:    nop
192; CHECK-NEXT:    #NO_APP
193; CHECK-NEXT:    kmovq %rsi, %k1
194; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
195; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
196; CHECK-NEXT:    retq
197  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
198  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
199  %3 = bitcast i64 %mask to <64 x i1>
200  ; load needed to keep the operation from being scheduled about the asm block
201  %4 = load <64 x i8>, ptr %a2
202  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
203  ret <64 x i8> %5
204}
205
206define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
207; CHECK-LABEL: stack_fold_pavgb_mask_commuted:
208; CHECK:       # %bb.0:
209; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
210; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
211; CHECK-NEXT:    #APP
212; CHECK-NEXT:    nop
213; CHECK-NEXT:    #NO_APP
214; CHECK-NEXT:    kmovq %rsi, %k1
215; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
216; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
217; CHECK-NEXT:    retq
218  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
219  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
220  %3 = bitcast i64 %mask to <64 x i1>
221  ; load needed to keep the operation from being scheduled about the asm block
222  %4 = load <64 x i8>, ptr %a2
223  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
224  ret <64 x i8> %5
225}
226
227define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
228; CHECK-LABEL: stack_fold_pavgb_maskz:
229; CHECK:       # %bb.0:
230; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
231; CHECK-NEXT:    #APP
232; CHECK-NEXT:    nop
233; CHECK-NEXT:    #NO_APP
234; CHECK-NEXT:    kmovq %rdi, %k1
235; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
236; CHECK-NEXT:    retq
237  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
238  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
239  %3 = bitcast i64 %mask to <64 x i1>
240  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
241  ret <64 x i8> %4
242}
243
244define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
245; CHECK-LABEL: stack_fold_pavgb_maskz_commuted:
246; CHECK:       # %bb.0:
247; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
248; CHECK-NEXT:    #APP
249; CHECK-NEXT:    nop
250; CHECK-NEXT:    #NO_APP
251; CHECK-NEXT:    kmovq %rdi, %k1
252; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
253; CHECK-NEXT:    retq
254  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
255  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
256  %3 = bitcast i64 %mask to <64 x i1>
257  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
258  ret <64 x i8> %4
259}
260
261define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
262; CHECK-LABEL: stack_fold_pavgw:
263; CHECK:       # %bb.0:
264; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
265; CHECK-NEXT:    #APP
266; CHECK-NEXT:    nop
267; CHECK-NEXT:    #NO_APP
268; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
269; CHECK-NEXT:    retq
270  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
271  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
272  ret <32 x i16> %2
273}
274declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)
275
276define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
277; CHECK-LABEL: stack_fold_pavgw_commuted:
278; CHECK:       # %bb.0:
279; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
280; CHECK-NEXT:    #APP
281; CHECK-NEXT:    nop
282; CHECK-NEXT:    #NO_APP
283; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
284; CHECK-NEXT:    retq
285  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
286  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
287  ret <32 x i16> %2
288}
289
290define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
291; CHECK-LABEL: stack_fold_pavgw_mask:
292; CHECK:       # %bb.0:
293; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
294; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
295; CHECK-NEXT:    #APP
296; CHECK-NEXT:    nop
297; CHECK-NEXT:    #NO_APP
298; CHECK-NEXT:    kmovd %esi, %k1
299; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
300; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
301; CHECK-NEXT:    retq
302  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
303  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
304  %3 = bitcast i32 %mask to <32 x i1>
305  ; load needed to keep the operation from being scheduled about the asm block
306  %4 = load <32 x i16>, ptr %a2
307  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
308  ret <32 x i16> %5
309}
310
311define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
312; CHECK-LABEL: stack_fold_pavgw_mask_commuted:
313; CHECK:       # %bb.0:
314; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
315; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
316; CHECK-NEXT:    #APP
317; CHECK-NEXT:    nop
318; CHECK-NEXT:    #NO_APP
319; CHECK-NEXT:    kmovd %esi, %k1
320; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
321; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
322; CHECK-NEXT:    retq
323  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
324  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
325  %3 = bitcast i32 %mask to <32 x i1>
326  ; load needed to keep the operation from being scheduled about the asm block
327  %4 = load <32 x i16>, ptr %a2
328  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
329  ret <32 x i16> %5
330}
331
332define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
333; CHECK-LABEL: stack_fold_pavgw_maskz:
334; CHECK:       # %bb.0:
335; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
336; CHECK-NEXT:    #APP
337; CHECK-NEXT:    nop
338; CHECK-NEXT:    #NO_APP
339; CHECK-NEXT:    kmovd %edi, %k1
340; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
341; CHECK-NEXT:    retq
342  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
343  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
344  %3 = bitcast i32 %mask to <32 x i1>
345  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
346  ret <32 x i16> %4
347}
348
349define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
350; CHECK-LABEL: stack_fold_pavgw_maskz_commuted:
351; CHECK:       # %bb.0:
352; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
353; CHECK-NEXT:    #APP
354; CHECK-NEXT:    nop
355; CHECK-NEXT:    #NO_APP
356; CHECK-NEXT:    kmovd %edi, %k1
357; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
358; CHECK-NEXT:    retq
359  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
360  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
361  %3 = bitcast i32 %mask to <32 x i1>
362  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
363  ret <32 x i16> %4
364}
365
366define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
367; CHECK-LABEL: stack_fold_extracti32x4:
368; CHECK:       # %bb.0:
369; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
370; CHECK-NEXT:    vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
371; CHECK-NEXT:    #APP
372; CHECK-NEXT:    nop
373; CHECK-NEXT:    #NO_APP
374; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
375; CHECK-NEXT:    vzeroupper
376; CHECK-NEXT:    retq
377  ; zext forces execution domain
378  %1 = zext <16 x i16> %a0 to <16 x i32>
379  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
380  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
381  ret <4 x i32> %2
382}
383
384define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
385; CHECK-LABEL: stack_fold_extracti64x2:
386; CHECK:       # %bb.0:
387; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
388; CHECK-NEXT:    vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
389; CHECK-NEXT:    #APP
390; CHECK-NEXT:    nop
391; CHECK-NEXT:    #NO_APP
392; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
393; CHECK-NEXT:    vzeroupper
394; CHECK-NEXT:    retq
395  ; zext forces execution domain
396  %1 = zext <8 x i32> %a0 to <8 x i64>
397  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
398  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
399  ret <2 x i64> %2
400}
401
402define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
403; CHECK-LABEL: stack_fold_extracti32x8:
404; CHECK:       # %bb.0:
405; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
406; CHECK-NEXT:    vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
407; CHECK-NEXT:    #APP
408; CHECK-NEXT:    nop
409; CHECK-NEXT:    #NO_APP
410; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
411; CHECK-NEXT:    retq
412  ; zext forces execution domain
413  %1 = zext <16 x i16> %a0 to <16 x i32>
414  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
415  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
416  ret <8 x i32> %2
417}
418
419define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
420; CHECK-LABEL: stack_fold_extracti64x4:
421; CHECK:       # %bb.0:
422; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
423; CHECK-NEXT:    vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
424; CHECK-NEXT:    #APP
425; CHECK-NEXT:    nop
426; CHECK-NEXT:    #NO_APP
427; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
428; CHECK-NEXT:    retq
429  ; zext forces execution domain
430  %1 = zext <8 x i32> %a0 to <8 x i64>
431  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
432  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
433  ret <4 x i64> %2
434}
435
436define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
437; CHECK-LABEL: stack_fold_inserti32x8:
438; CHECK:       # %bb.0:
439; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
440; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
441; CHECK-NEXT:    #APP
442; CHECK-NEXT:    nop
443; CHECK-NEXT:    #NO_APP
444; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
445; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
446; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
447; CHECK-NEXT:    retq
448  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
449  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
450  ; add forces execution domain
451  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
452  ret <16 x i32> %3
453}
454
455define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) {
456; CHECK-LABEL: stack_fold_inserti64x4:
457; CHECK:       # %bb.0:
458; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
459; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
460; CHECK-NEXT:    #APP
461; CHECK-NEXT:    nop
462; CHECK-NEXT:    #NO_APP
463; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
464; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
465; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
466; CHECK-NEXT:    retq
467  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
468  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
469  ; add forces execution domain
470  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
471  ret <8 x i64> %3
472}
473
474define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) {
475; CHECK-LABEL: stack_fold_pabsb:
476; CHECK:       # %bb.0:
477; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
478; CHECK-NEXT:    #APP
479; CHECK-NEXT:    nop
480; CHECK-NEXT:    #NO_APP
481; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
482; CHECK-NEXT:    retq
483  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
484  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
485  %3 = sub <64 x i8> zeroinitializer, %a0
486  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
487  ret <64 x i8> %4
488}
489
490define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) {
491; CHECK-LABEL: stack_fold_pabsb_mask:
492; CHECK:       # %bb.0:
493; CHECK-NEXT:    pushq %rax
494; CHECK-NEXT:    .cfi_def_cfa_offset 16
495; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
496; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
497; CHECK-NEXT:    #APP
498; CHECK-NEXT:    nop
499; CHECK-NEXT:    #NO_APP
500; CHECK-NEXT:    kmovq %rdi, %k1
501; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
502; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
503; CHECK-NEXT:    popq %rax
504; CHECK-NEXT:    .cfi_def_cfa_offset 8
505; CHECK-NEXT:    retq
506  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
507  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
508  %3 = sub <64 x i8> zeroinitializer, %a0
509  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
510  %5 = bitcast i64 %mask to <64 x i1>
511  %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> %passthru
512  ret <64 x i8> %6
513}
514
515define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) {
516; CHECK-LABEL: stack_fold_pabsb_maskz:
517; CHECK:       # %bb.0:
518; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
519; CHECK-NEXT:    #APP
520; CHECK-NEXT:    nop
521; CHECK-NEXT:    #NO_APP
522; CHECK-NEXT:    kmovq %rdi, %k1
523; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
524; CHECK-NEXT:    retq
525  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
526  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
527  %3 = sub <64 x i8> zeroinitializer, %a0
528  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
529  %5 = bitcast i64 %mask to <64 x i1>
530  %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
531  ret <64 x i8> %6
532}
533
534define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) {
535; CHECK-LABEL: stack_fold_pabsd:
536; CHECK:       # %bb.0:
537; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
538; CHECK-NEXT:    #APP
539; CHECK-NEXT:    nop
540; CHECK-NEXT:    #NO_APP
541; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
542; CHECK-NEXT:    retq
543  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
544  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
545  %3 = sub <16 x i32> zeroinitializer, %a0
546  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
547  ret <16 x i32> %4
548}
549
550define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
551; CHECK-LABEL: stack_fold_pabsd_mask:
552; CHECK:       # %bb.0:
553; CHECK-NEXT:    pushq %rax
554; CHECK-NEXT:    .cfi_def_cfa_offset 16
555; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
556; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
557; CHECK-NEXT:    #APP
558; CHECK-NEXT:    nop
559; CHECK-NEXT:    #NO_APP
560; CHECK-NEXT:    kmovd %edi, %k1
561; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
562; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
563; CHECK-NEXT:    popq %rax
564; CHECK-NEXT:    .cfi_def_cfa_offset 8
565; CHECK-NEXT:    retq
566  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
567  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
568  %3 = sub <16 x i32> zeroinitializer, %a0
569  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
570  %5 = bitcast i16 %mask to <16 x i1>
571  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> %passthru
572  ret <16 x i32> %6
573}
574
575define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) {
576; CHECK-LABEL: stack_fold_pabsd_maskz:
577; CHECK:       # %bb.0:
578; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
579; CHECK-NEXT:    #APP
580; CHECK-NEXT:    nop
581; CHECK-NEXT:    #NO_APP
582; CHECK-NEXT:    kmovd %edi, %k1
583; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
584; CHECK-NEXT:    retq
585  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
586  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
587  %3 = sub <16 x i32> zeroinitializer, %a0
588  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
589  %5 = bitcast i16 %mask to <16 x i1>
590  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
591  ret <16 x i32> %6
592}
593
594define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) {
595; CHECK-LABEL: stack_fold_pabsq:
596; CHECK:       # %bb.0:
597; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
598; CHECK-NEXT:    #APP
599; CHECK-NEXT:    nop
600; CHECK-NEXT:    #NO_APP
601; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
602; CHECK-NEXT:    retq
603  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
604  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
605  %3 = sub <8 x i64> zeroinitializer, %a0
606  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
607  ret <8 x i64> %4
608}
609
610define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) {
611; CHECK-LABEL: stack_fold_pabsq_mask:
612; CHECK:       # %bb.0:
613; CHECK-NEXT:    pushq %rax
614; CHECK-NEXT:    .cfi_def_cfa_offset 16
615; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
616; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
617; CHECK-NEXT:    #APP
618; CHECK-NEXT:    nop
619; CHECK-NEXT:    #NO_APP
620; CHECK-NEXT:    kmovd %edi, %k1
621; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
622; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
623; CHECK-NEXT:    popq %rax
624; CHECK-NEXT:    .cfi_def_cfa_offset 8
625; CHECK-NEXT:    retq
626  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
627  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
628  %3 = sub <8 x i64> zeroinitializer, %a0
629  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
630  %5 = bitcast i8 %mask to <8 x i1>
631  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %passthru
632  ret <8 x i64> %6
633}
634
635define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) {
636; CHECK-LABEL: stack_fold_pabsq_maskz:
637; CHECK:       # %bb.0:
638; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
639; CHECK-NEXT:    #APP
640; CHECK-NEXT:    nop
641; CHECK-NEXT:    #NO_APP
642; CHECK-NEXT:    kmovd %edi, %k1
643; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
644; CHECK-NEXT:    retq
645  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
646  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
647  %3 = sub <8 x i64> zeroinitializer, %a0
648  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
649  %5 = bitcast i8 %mask to <8 x i1>
650  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
651  ret <8 x i64> %6
652}
653
654define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) {
655; CHECK-LABEL: stack_fold_pabsw:
656; CHECK:       # %bb.0:
657; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
658; CHECK-NEXT:    #APP
659; CHECK-NEXT:    nop
660; CHECK-NEXT:    #NO_APP
661; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
662; CHECK-NEXT:    retq
663  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
664  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
665  %3 = sub <32 x i16> zeroinitializer, %a0
666  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
667  ret <32 x i16> %4
668}
669
670define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
671; CHECK-LABEL: stack_fold_pabsw_mask:
672; CHECK:       # %bb.0:
673; CHECK-NEXT:    pushq %rax
674; CHECK-NEXT:    .cfi_def_cfa_offset 16
675; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
676; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
677; CHECK-NEXT:    #APP
678; CHECK-NEXT:    nop
679; CHECK-NEXT:    #NO_APP
680; CHECK-NEXT:    kmovd %edi, %k1
681; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
682; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
683; CHECK-NEXT:    popq %rax
684; CHECK-NEXT:    .cfi_def_cfa_offset 8
685; CHECK-NEXT:    retq
686  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
687  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
688  %3 = sub <32 x i16> zeroinitializer, %a0
689  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
690  %5 = bitcast i32 %mask to <32 x i1>
691  %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> %passthru
692  ret <32 x i16> %6
693}
694
695define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) {
696; CHECK-LABEL: stack_fold_pabsw_maskz:
697; CHECK:       # %bb.0:
698; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
699; CHECK-NEXT:    #APP
700; CHECK-NEXT:    nop
701; CHECK-NEXT:    #NO_APP
702; CHECK-NEXT:    kmovd %edi, %k1
703; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
704; CHECK-NEXT:    retq
705  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
706  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
707  %3 = sub <32 x i16> zeroinitializer, %a0
708  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
709  %5 = bitcast i32 %mask to <32 x i1>
710  %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer
711  ret <32 x i16> %6
712}
713
714define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) {
715; CHECK-LABEL: stack_fold_packssdw:
716; CHECK:       # %bb.0:
717; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
718; CHECK-NEXT:    #APP
719; CHECK-NEXT:    nop
720; CHECK-NEXT:    #NO_APP
721; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
722; CHECK-NEXT:    retq
723  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
724  %2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1)
725  ret <32 x i16> %2
726}
727declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
728
729define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) {
730; CHECK-LABEL: stack_fold_packsswb:
731; CHECK:       # %bb.0:
732; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
733; CHECK-NEXT:    #APP
734; CHECK-NEXT:    nop
735; CHECK-NEXT:    #NO_APP
736; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
737; CHECK-NEXT:    retq
738  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
739  %2 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1)
740  ret <64 x i8> %2
741}
742declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
743
744define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) {
745; CHECK-LABEL: stack_fold_packusdw:
746; CHECK:       # %bb.0:
747; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
748; CHECK-NEXT:    #APP
749; CHECK-NEXT:    nop
750; CHECK-NEXT:    #NO_APP
751; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
752; CHECK-NEXT:    retq
753  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
754  %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
755  ret <32 x i16> %2
756}
757declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
758
759define <32 x i16> @stack_fold_packusdw_mask(ptr %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
760; CHECK-LABEL: stack_fold_packusdw_mask:
761; CHECK:       # %bb.0:
762; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
763; CHECK-NEXT:    #APP
764; CHECK-NEXT:    nop
765; CHECK-NEXT:    #NO_APP
766; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
767; CHECK-NEXT:    kmovd %esi, %k1
768; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
769; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
770; CHECK-NEXT:    retq
771  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
772  %2 = load <32 x i16>, ptr %passthru
773  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
774  %4 = bitcast i32 %mask to <32 x i1>
775  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %2
776  ret <32 x i16> %5
777}
778
779define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
780; CHECK-LABEL: stack_fold_packusdw_maskz:
781; CHECK:       # %bb.0:
782; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
783; CHECK-NEXT:    #APP
784; CHECK-NEXT:    nop
785; CHECK-NEXT:    #NO_APP
786; CHECK-NEXT:    kmovd %edi, %k1
787; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
788; CHECK-NEXT:    retq
789  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
790  %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
791  %3 = bitcast i32 %mask to <32 x i1>
792  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
793  ret <32 x i16> %4
794}
795
796define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) {
797; CHECK-LABEL: stack_fold_packuswb:
798; CHECK:       # %bb.0:
799; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
800; CHECK-NEXT:    #APP
801; CHECK-NEXT:    nop
802; CHECK-NEXT:    #NO_APP
803; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
804; CHECK-NEXT:    retq
805  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
806  %2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1)
807  ret <64 x i8> %2
808}
809declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
810
811define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
812; CHECK-LABEL: stack_fold_paddb:
813; CHECK:       # %bb.0:
814; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
815; CHECK-NEXT:    #APP
816; CHECK-NEXT:    nop
817; CHECK-NEXT:    #NO_APP
818; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
819; CHECK-NEXT:    retq
820  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
821  %2 = add <64 x i8> %a0, %a1
822  ret <64 x i8> %2
823}
824
825define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
826; CHECK-LABEL: stack_fold_paddb_commuted:
827; CHECK:       # %bb.0:
828; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
829; CHECK-NEXT:    #APP
830; CHECK-NEXT:    nop
831; CHECK-NEXT:    #NO_APP
832; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
833; CHECK-NEXT:    retq
834  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
835  %2 = add <64 x i8> %a1, %a0
836  ret <64 x i8> %2
837}
838
839define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
840; CHECK-LABEL: stack_fold_paddb_mask:
841; CHECK:       # %bb.0:
842; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
843; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
844; CHECK-NEXT:    #APP
845; CHECK-NEXT:    nop
846; CHECK-NEXT:    #NO_APP
847; CHECK-NEXT:    kmovq %rsi, %k1
848; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
849; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
850; CHECK-NEXT:    retq
851  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
852  %2 = add <64 x i8> %a0, %a1
853  %3 = bitcast i64 %mask to <64 x i1>
854  ; load needed to keep the operation from being scheduled about the asm block
855  %4 = load <64 x i8>, ptr %a2
856  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
857  ret <64 x i8> %5
858}
859
860define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
861; CHECK-LABEL: stack_fold_paddb_mask_commuted:
862; CHECK:       # %bb.0:
863; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
864; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
865; CHECK-NEXT:    #APP
866; CHECK-NEXT:    nop
867; CHECK-NEXT:    #NO_APP
868; CHECK-NEXT:    kmovq %rsi, %k1
869; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
870; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
871; CHECK-NEXT:    retq
872  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
873  %2 = add <64 x i8> %a1, %a0
874  %3 = bitcast i64 %mask to <64 x i1>
875  ; load needed to keep the operation from being scheduled about the asm block
876  %4 = load <64 x i8>, ptr %a2
877  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
878  ret <64 x i8> %5
879}
880
881define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
882; CHECK-LABEL: stack_fold_paddb_maskz:
883; CHECK:       # %bb.0:
884; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
885; CHECK-NEXT:    #APP
886; CHECK-NEXT:    nop
887; CHECK-NEXT:    #NO_APP
888; CHECK-NEXT:    kmovq %rdi, %k1
889; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
890; CHECK-NEXT:    retq
891  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
892  %2 = add <64 x i8> %a0, %a1
893  %3 = bitcast i64 %mask to <64 x i1>
894  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
895  ret <64 x i8> %4
896}
897
898define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
899; CHECK-LABEL: stack_fold_paddb_maskz_commuted:
900; CHECK:       # %bb.0:
901; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
902; CHECK-NEXT:    #APP
903; CHECK-NEXT:    nop
904; CHECK-NEXT:    #NO_APP
905; CHECK-NEXT:    kmovq %rdi, %k1
906; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
907; CHECK-NEXT:    retq
908  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
909  %2 = add <64 x i8> %a1, %a0
910  %3 = bitcast i64 %mask to <64 x i1>
911  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
912  ret <64 x i8> %4
913}
914
915define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) {
916; CHECK-LABEL: stack_fold_paddd:
917; CHECK:       # %bb.0:
918; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
919; CHECK-NEXT:    #APP
920; CHECK-NEXT:    nop
921; CHECK-NEXT:    #NO_APP
922; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
923; CHECK-NEXT:    retq
924  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
925  %2 = add <16 x i32> %a0, %a1
926  ret <16 x i32> %2
927}
928
929define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
930; CHECK-LABEL: stack_fold_paddd_commuted:
931; CHECK:       # %bb.0:
932; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
933; CHECK-NEXT:    #APP
934; CHECK-NEXT:    nop
935; CHECK-NEXT:    #NO_APP
936; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
937; CHECK-NEXT:    retq
938  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
939  %2 = add <16 x i32> %a1, %a0
940  ret <16 x i32> %2
941}
942
943define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
944; CHECK-LABEL: stack_fold_paddd_mask:
945; CHECK:       # %bb.0:
946; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
947; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
948; CHECK-NEXT:    #APP
949; CHECK-NEXT:    nop
950; CHECK-NEXT:    #NO_APP
951; CHECK-NEXT:    kmovd %esi, %k1
952; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
953; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
954; CHECK-NEXT:    retq
955  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
956  %2 = add <16 x i32> %a0, %a1
957  %3 = bitcast i16 %mask to <16 x i1>
958  ; load needed to keep the operation from being scheduled about the asm block
959  %4 = load <16 x i32>, ptr %a2
960  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
961  ret <16 x i32> %5
962}
963
964define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
965; CHECK-LABEL: stack_fold_paddd_mask_commuted:
966; CHECK:       # %bb.0:
967; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
968; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
969; CHECK-NEXT:    #APP
970; CHECK-NEXT:    nop
971; CHECK-NEXT:    #NO_APP
972; CHECK-NEXT:    kmovd %esi, %k1
973; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
974; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
975; CHECK-NEXT:    retq
976  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
977  %2 = add <16 x i32> %a1, %a0
978  %3 = bitcast i16 %mask to <16 x i1>
979  ; load needed to keep the operation from being scheduled about the asm block
980  %4 = load <16 x i32>, ptr %a2
981  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
982  ret <16 x i32> %5
983}
984
985define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
986; CHECK-LABEL: stack_fold_paddd_maskz:
987; CHECK:       # %bb.0:
988; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
989; CHECK-NEXT:    #APP
990; CHECK-NEXT:    nop
991; CHECK-NEXT:    #NO_APP
992; CHECK-NEXT:    kmovd %edi, %k1
993; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
994; CHECK-NEXT:    retq
995  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
996  %2 = add <16 x i32> %a0, %a1
997  %3 = bitcast i16 %mask to <16 x i1>
998  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
999  ret <16 x i32> %4
1000}
1001
1002define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1003; CHECK-LABEL: stack_fold_paddd_maskz_commuted:
1004; CHECK:       # %bb.0:
1005; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1006; CHECK-NEXT:    #APP
1007; CHECK-NEXT:    nop
1008; CHECK-NEXT:    #NO_APP
1009; CHECK-NEXT:    kmovd %edi, %k1
1010; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1011; CHECK-NEXT:    retq
1012  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1013  %2 = add <16 x i32> %a1, %a0
1014  %3 = bitcast i16 %mask to <16 x i1>
1015  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1016  ret <16 x i32> %4
1017}
1018
1019define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) {
1020; CHECK-LABEL: stack_fold_paddq:
1021; CHECK:       # %bb.0:
1022; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1023; CHECK-NEXT:    #APP
1024; CHECK-NEXT:    nop
1025; CHECK-NEXT:    #NO_APP
1026; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1027; CHECK-NEXT:    retq
1028  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1029  %2 = add <8 x i64> %a0, %a1
1030  ret <8 x i64> %2
1031}
1032
1033define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1034; CHECK-LABEL: stack_fold_paddq_commuted:
1035; CHECK:       # %bb.0:
1036; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1037; CHECK-NEXT:    #APP
1038; CHECK-NEXT:    nop
1039; CHECK-NEXT:    #NO_APP
1040; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1041; CHECK-NEXT:    retq
1042  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1043  %2 = add <8 x i64> %a1, %a0
1044  ret <8 x i64> %2
1045}
1046
1047define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1048; CHECK-LABEL: stack_fold_paddq_mask:
1049; CHECK:       # %bb.0:
1050; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1051; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1052; CHECK-NEXT:    #APP
1053; CHECK-NEXT:    nop
1054; CHECK-NEXT:    #NO_APP
1055; CHECK-NEXT:    kmovd %esi, %k1
1056; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1057; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1058; CHECK-NEXT:    retq
1059  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1060  %2 = add <8 x i64> %a0, %a1
1061  %3 = bitcast i8 %mask to <8 x i1>
1062  ; load needed to keep the operation from being scheduled about the asm block
1063  %4 = load <8 x i64>, ptr %a2
1064  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1065  ret <8 x i64> %5
1066}
1067
1068define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1069; CHECK-LABEL: stack_fold_paddq_mask_commuted:
1070; CHECK:       # %bb.0:
1071; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1072; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1073; CHECK-NEXT:    #APP
1074; CHECK-NEXT:    nop
1075; CHECK-NEXT:    #NO_APP
1076; CHECK-NEXT:    kmovd %esi, %k1
1077; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1078; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1079; CHECK-NEXT:    retq
1080  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1081  %2 = add <8 x i64> %a1, %a0
1082  %3 = bitcast i8 %mask to <8 x i1>
1083  ; load needed to keep the operation from being scheduled about the asm block
1084  %4 = load <8 x i64>, ptr %a2
1085  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1086  ret <8 x i64> %5
1087}
1088
1089define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1090; CHECK-LABEL: stack_fold_paddq_maskz:
1091; CHECK:       # %bb.0:
1092; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1093; CHECK-NEXT:    #APP
1094; CHECK-NEXT:    nop
1095; CHECK-NEXT:    #NO_APP
1096; CHECK-NEXT:    kmovd %edi, %k1
1097; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1098; CHECK-NEXT:    retq
1099  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1100  %2 = add <8 x i64> %a0, %a1
1101  %3 = bitcast i8 %mask to <8 x i1>
1102  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1103  ret <8 x i64> %4
1104}
1105
1106define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1107; CHECK-LABEL: stack_fold_paddq_maskz_commuted:
1108; CHECK:       # %bb.0:
1109; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1110; CHECK-NEXT:    #APP
1111; CHECK-NEXT:    nop
1112; CHECK-NEXT:    #NO_APP
1113; CHECK-NEXT:    kmovd %edi, %k1
1114; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1115; CHECK-NEXT:    retq
1116  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1117  %2 = add <8 x i64> %a1, %a0
1118  %3 = bitcast i8 %mask to <8 x i1>
1119  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1120  ret <8 x i64> %4
1121}
1122
1123define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) {
1124; CHECK-LABEL: stack_fold_paddsb:
1125; CHECK:       # %bb.0:
1126; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1127; CHECK-NEXT:    #APP
1128; CHECK-NEXT:    nop
1129; CHECK-NEXT:    #NO_APP
1130; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1131; CHECK-NEXT:    retq
1132  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1133  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1134  ret <64 x i8> %2
1135}
1136
1137define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1138; CHECK-LABEL: stack_fold_paddsb_commuted:
1139; CHECK:       # %bb.0:
1140; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1141; CHECK-NEXT:    #APP
1142; CHECK-NEXT:    nop
1143; CHECK-NEXT:    #NO_APP
1144; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1145; CHECK-NEXT:    retq
1146  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1147  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1148  ret <64 x i8> %2
1149}
1150
1151define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1152; CHECK-LABEL: stack_fold_paddsb_mask:
1153; CHECK:       # %bb.0:
1154; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1155; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1156; CHECK-NEXT:    #APP
1157; CHECK-NEXT:    nop
1158; CHECK-NEXT:    #NO_APP
1159; CHECK-NEXT:    kmovq %rsi, %k1
1160; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1161; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1162; CHECK-NEXT:    retq
1163  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1164  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1165  %3 = bitcast i64 %mask to <64 x i1>
1166  ; load needed to keep the operation from being scheduled about the asm block
1167  %4 = load <64 x i8>, ptr %a2
1168  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1169  ret <64 x i8> %5
1170}
1171
1172define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1173; CHECK-LABEL: stack_fold_paddsb_mask_commuted:
1174; CHECK:       # %bb.0:
1175; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1176; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1177; CHECK-NEXT:    #APP
1178; CHECK-NEXT:    nop
1179; CHECK-NEXT:    #NO_APP
1180; CHECK-NEXT:    kmovq %rsi, %k1
1181; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1182; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1183; CHECK-NEXT:    retq
1184  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1185  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1186  %3 = bitcast i64 %mask to <64 x i1>
1187  ; load needed to keep the operation from being scheduled about the asm block
1188  %4 = load <64 x i8>, ptr %a2
1189  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1190  ret <64 x i8> %5
1191}
1192
1193define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1194; CHECK-LABEL: stack_fold_paddsb_maskz:
1195; CHECK:       # %bb.0:
1196; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1197; CHECK-NEXT:    #APP
1198; CHECK-NEXT:    nop
1199; CHECK-NEXT:    #NO_APP
1200; CHECK-NEXT:    kmovq %rdi, %k1
1201; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1202; CHECK-NEXT:    retq
1203  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1204  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1205  %3 = bitcast i64 %mask to <64 x i1>
1206  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1207  ret <64 x i8> %4
1208}
1209
1210define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1211; CHECK-LABEL: stack_fold_paddsb_maskz_commuted:
1212; CHECK:       # %bb.0:
1213; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1214; CHECK-NEXT:    #APP
1215; CHECK-NEXT:    nop
1216; CHECK-NEXT:    #NO_APP
1217; CHECK-NEXT:    kmovq %rdi, %k1
1218; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1219; CHECK-NEXT:    retq
1220  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1221  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1222  %3 = bitcast i64 %mask to <64 x i1>
1223  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1224  ret <64 x i8> %4
1225}
1226
1227define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) {
1228; CHECK-LABEL: stack_fold_paddsw:
1229; CHECK:       # %bb.0:
1230; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1231; CHECK-NEXT:    #APP
1232; CHECK-NEXT:    nop
1233; CHECK-NEXT:    #NO_APP
1234; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1235; CHECK-NEXT:    retq
1236  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1237  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1238  ret <32 x i16> %2
1239}
1240
1241define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1242; CHECK-LABEL: stack_fold_paddsw_commuted:
1243; CHECK:       # %bb.0:
1244; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1245; CHECK-NEXT:    #APP
1246; CHECK-NEXT:    nop
1247; CHECK-NEXT:    #NO_APP
1248; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1249; CHECK-NEXT:    retq
1250  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1251  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1252  ret <32 x i16> %2
1253}
1254
1255define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1256; CHECK-LABEL: stack_fold_paddsw_mask:
1257; CHECK:       # %bb.0:
1258; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1259; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1260; CHECK-NEXT:    #APP
1261; CHECK-NEXT:    nop
1262; CHECK-NEXT:    #NO_APP
1263; CHECK-NEXT:    kmovd %esi, %k1
1264; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1265; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1266; CHECK-NEXT:    retq
1267  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1268  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1269  %3 = bitcast i32 %mask to <32 x i1>
1270  ; load needed to keep the operation from being scheduled about the asm block
1271  %4 = load <32 x i16>, ptr %a2
1272  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1273  ret <32 x i16> %5
1274}
1275
1276define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1277; CHECK-LABEL: stack_fold_paddsw_mask_commuted:
1278; CHECK:       # %bb.0:
1279; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1280; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1281; CHECK-NEXT:    #APP
1282; CHECK-NEXT:    nop
1283; CHECK-NEXT:    #NO_APP
1284; CHECK-NEXT:    kmovd %esi, %k1
1285; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1286; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1287; CHECK-NEXT:    retq
1288  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1289  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1290  %3 = bitcast i32 %mask to <32 x i1>
1291  ; load needed to keep the operation from being scheduled about the asm block
1292  %4 = load <32 x i16>, ptr %a2
1293  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1294  ret <32 x i16> %5
1295}
1296
1297define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1298; CHECK-LABEL: stack_fold_paddsw_maskz:
1299; CHECK:       # %bb.0:
1300; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1301; CHECK-NEXT:    #APP
1302; CHECK-NEXT:    nop
1303; CHECK-NEXT:    #NO_APP
1304; CHECK-NEXT:    kmovd %edi, %k1
1305; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1306; CHECK-NEXT:    retq
1307  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1308  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1309  %3 = bitcast i32 %mask to <32 x i1>
1310  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1311  ret <32 x i16> %4
1312}
1313
1314define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1315; CHECK-LABEL: stack_fold_paddsw_maskz_commuted:
1316; CHECK:       # %bb.0:
1317; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1318; CHECK-NEXT:    #APP
1319; CHECK-NEXT:    nop
1320; CHECK-NEXT:    #NO_APP
1321; CHECK-NEXT:    kmovd %edi, %k1
1322; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1323; CHECK-NEXT:    retq
1324  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1325  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1326  %3 = bitcast i32 %mask to <32 x i1>
1327  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1328  ret <32 x i16> %4
1329}
1330
1331define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) {
1332; CHECK-LABEL: stack_fold_paddusb:
1333; CHECK:       # %bb.0:
1334; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1335; CHECK-NEXT:    #APP
1336; CHECK-NEXT:    nop
1337; CHECK-NEXT:    #NO_APP
1338; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1339; CHECK-NEXT:    retq
1340  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1341  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1342  ret <64 x i8> %2
1343}
1344
1345define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1346; CHECK-LABEL: stack_fold_paddusb_commuted:
1347; CHECK:       # %bb.0:
1348; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1349; CHECK-NEXT:    #APP
1350; CHECK-NEXT:    nop
1351; CHECK-NEXT:    #NO_APP
1352; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1353; CHECK-NEXT:    retq
1354  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1355  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1356  ret <64 x i8> %2
1357}
1358
1359define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1360; CHECK-LABEL: stack_fold_paddusb_mask:
1361; CHECK:       # %bb.0:
1362; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1363; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1364; CHECK-NEXT:    #APP
1365; CHECK-NEXT:    nop
1366; CHECK-NEXT:    #NO_APP
1367; CHECK-NEXT:    kmovq %rsi, %k1
1368; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1369; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1370; CHECK-NEXT:    retq
1371  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1372  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1373  %3 = bitcast i64 %mask to <64 x i1>
1374  ; load needed to keep the operation from being scheduled about the asm block
1375  %4 = load <64 x i8>, ptr %a2
1376  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1377  ret <64 x i8> %5
1378}
1379
1380define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1381; CHECK-LABEL: stack_fold_paddusb_mask_commuted:
1382; CHECK:       # %bb.0:
1383; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1384; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1385; CHECK-NEXT:    #APP
1386; CHECK-NEXT:    nop
1387; CHECK-NEXT:    #NO_APP
1388; CHECK-NEXT:    kmovq %rsi, %k1
1389; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1390; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1391; CHECK-NEXT:    retq
1392  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1393  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1394  %3 = bitcast i64 %mask to <64 x i1>
1395  ; load needed to keep the operation from being scheduled about the asm block
1396  %4 = load <64 x i8>, ptr %a2
1397  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1398  ret <64 x i8> %5
1399}
1400
1401define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1402; CHECK-LABEL: stack_fold_paddusb_maskz:
1403; CHECK:       # %bb.0:
1404; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1405; CHECK-NEXT:    #APP
1406; CHECK-NEXT:    nop
1407; CHECK-NEXT:    #NO_APP
1408; CHECK-NEXT:    kmovq %rdi, %k1
1409; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1410; CHECK-NEXT:    retq
1411  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1412  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1413  %3 = bitcast i64 %mask to <64 x i1>
1414  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1415  ret <64 x i8> %4
1416}
1417
1418define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1419; CHECK-LABEL: stack_fold_paddusb_maskz_commuted:
1420; CHECK:       # %bb.0:
1421; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1422; CHECK-NEXT:    #APP
1423; CHECK-NEXT:    nop
1424; CHECK-NEXT:    #NO_APP
1425; CHECK-NEXT:    kmovq %rdi, %k1
1426; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1427; CHECK-NEXT:    retq
1428  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1429  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1430  %3 = bitcast i64 %mask to <64 x i1>
1431  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1432  ret <64 x i8> %4
1433}
1434
1435define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) {
1436; CHECK-LABEL: stack_fold_paddusw:
1437; CHECK:       # %bb.0:
1438; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1439; CHECK-NEXT:    #APP
1440; CHECK-NEXT:    nop
1441; CHECK-NEXT:    #NO_APP
1442; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1443; CHECK-NEXT:    retq
1444  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1445  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1446  ret <32 x i16> %2
1447}
1448
1449define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1450; CHECK-LABEL: stack_fold_paddusw_commuted:
1451; CHECK:       # %bb.0:
1452; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1453; CHECK-NEXT:    #APP
1454; CHECK-NEXT:    nop
1455; CHECK-NEXT:    #NO_APP
1456; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1457; CHECK-NEXT:    retq
1458  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1459  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1460  ret <32 x i16> %2
1461}
1462
1463define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1464; CHECK-LABEL: stack_fold_paddusw_mask:
1465; CHECK:       # %bb.0:
1466; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1467; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1468; CHECK-NEXT:    #APP
1469; CHECK-NEXT:    nop
1470; CHECK-NEXT:    #NO_APP
1471; CHECK-NEXT:    kmovd %esi, %k1
1472; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1473; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1474; CHECK-NEXT:    retq
1475  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1476  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1477  %3 = bitcast i32 %mask to <32 x i1>
1478  ; load needed to keep the operation from being scheduled about the asm block
1479  %4 = load <32 x i16>, ptr %a2
1480  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1481  ret <32 x i16> %5
1482}
1483
1484define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1485; CHECK-LABEL: stack_fold_paddusw_mask_commuted:
1486; CHECK:       # %bb.0:
1487; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1488; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1489; CHECK-NEXT:    #APP
1490; CHECK-NEXT:    nop
1491; CHECK-NEXT:    #NO_APP
1492; CHECK-NEXT:    kmovd %esi, %k1
1493; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1494; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1495; CHECK-NEXT:    retq
1496  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1497  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1498  %3 = bitcast i32 %mask to <32 x i1>
1499  ; load needed to keep the operation from being scheduled about the asm block
1500  %4 = load <32 x i16>, ptr %a2
1501  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1502  ret <32 x i16> %5
1503}
1504
1505define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1506; CHECK-LABEL: stack_fold_paddusw_maskz:
1507; CHECK:       # %bb.0:
1508; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1509; CHECK-NEXT:    #APP
1510; CHECK-NEXT:    nop
1511; CHECK-NEXT:    #NO_APP
1512; CHECK-NEXT:    kmovd %edi, %k1
1513; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1514; CHECK-NEXT:    retq
1515  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1516  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1517  %3 = bitcast i32 %mask to <32 x i1>
1518  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1519  ret <32 x i16> %4
1520}
1521
1522define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1523; CHECK-LABEL: stack_fold_paddusw_maskz_commuted:
1524; CHECK:       # %bb.0:
1525; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1526; CHECK-NEXT:    #APP
1527; CHECK-NEXT:    nop
1528; CHECK-NEXT:    #NO_APP
1529; CHECK-NEXT:    kmovd %edi, %k1
1530; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1531; CHECK-NEXT:    retq
1532  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1533  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1534  %3 = bitcast i32 %mask to <32 x i1>
1535  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1536  ret <32 x i16> %4
1537}
1538
1539define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) {
1540; CHECK-LABEL: stack_fold_paddw:
1541; CHECK:       # %bb.0:
1542; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1543; CHECK-NEXT:    #APP
1544; CHECK-NEXT:    nop
1545; CHECK-NEXT:    #NO_APP
1546; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1547; CHECK-NEXT:    retq
1548  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1549  %2 = add <32 x i16> %a0, %a1
1550  ret <32 x i16> %2
1551}
1552
1553define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1554; CHECK-LABEL: stack_fold_paddw_commuted:
1555; CHECK:       # %bb.0:
1556; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1557; CHECK-NEXT:    #APP
1558; CHECK-NEXT:    nop
1559; CHECK-NEXT:    #NO_APP
1560; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1561; CHECK-NEXT:    retq
1562  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1563  %2 = add <32 x i16> %a1, %a0
1564  ret <32 x i16> %2
1565}
1566
1567define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1568; CHECK-LABEL: stack_fold_paddw_mask:
1569; CHECK:       # %bb.0:
1570; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1571; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1572; CHECK-NEXT:    #APP
1573; CHECK-NEXT:    nop
1574; CHECK-NEXT:    #NO_APP
1575; CHECK-NEXT:    kmovd %esi, %k1
1576; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1577; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1578; CHECK-NEXT:    retq
1579  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1580  %2 = add <32 x i16> %a0, %a1
1581  %3 = bitcast i32 %mask to <32 x i1>
1582  ; load needed to keep the operation from being scheduled about the asm block
1583  %4 = load <32 x i16>, ptr %a2
1584  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1585  ret <32 x i16> %5
1586}
1587
1588define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1589; CHECK-LABEL: stack_fold_paddw_mask_commuted:
1590; CHECK:       # %bb.0:
1591; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1592; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1593; CHECK-NEXT:    #APP
1594; CHECK-NEXT:    nop
1595; CHECK-NEXT:    #NO_APP
1596; CHECK-NEXT:    kmovd %esi, %k1
1597; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1598; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1599; CHECK-NEXT:    retq
1600  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1601  %2 = add <32 x i16> %a1, %a0
1602  %3 = bitcast i32 %mask to <32 x i1>
1603  ; load needed to keep the operation from being scheduled about the asm block
1604  %4 = load <32 x i16>, ptr %a2
1605  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1606  ret <32 x i16> %5
1607}
1608
1609define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1610; CHECK-LABEL: stack_fold_paddw_maskz:
1611; CHECK:       # %bb.0:
1612; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1613; CHECK-NEXT:    #APP
1614; CHECK-NEXT:    nop
1615; CHECK-NEXT:    #NO_APP
1616; CHECK-NEXT:    kmovd %edi, %k1
1617; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1618; CHECK-NEXT:    retq
1619  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1620  %2 = add <32 x i16> %a0, %a1
1621  %3 = bitcast i32 %mask to <32 x i1>
1622  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1623  ret <32 x i16> %4
1624}
1625
1626define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1627; CHECK-LABEL: stack_fold_paddw_maskz_commuted:
1628; CHECK:       # %bb.0:
1629; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1630; CHECK-NEXT:    #APP
1631; CHECK-NEXT:    nop
1632; CHECK-NEXT:    #NO_APP
1633; CHECK-NEXT:    kmovd %edi, %k1
1634; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1635; CHECK-NEXT:    retq
1636  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1637  %2 = add <32 x i16> %a1, %a0
1638  %3 = bitcast i32 %mask to <32 x i1>
1639  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1640  ret <32 x i16> %4
1641}
1642
1643define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) {
1644; CHECK-LABEL: stack_fold_palignr:
1645; CHECK:       # %bb.0:
1646; CHECK-NEXT:    pushq %rax
1647; CHECK-NEXT:    .cfi_def_cfa_offset 16
1648; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1649; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1650; CHECK-NEXT:    #APP
1651; CHECK-NEXT:    nop
1652; CHECK-NEXT:    #NO_APP
1653; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1654; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1655; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1656; CHECK-NEXT:    popq %rax
1657; CHECK-NEXT:    .cfi_def_cfa_offset 8
1658; CHECK-NEXT:    retq
1659  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1660  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1661  ret <64 x i8> %2
1662}
1663
1664define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %passthru, i64 %mask) {
1665; CHECK-LABEL: stack_fold_palignr_mask:
1666; CHECK:       # %bb.0:
1667; CHECK-NEXT:    pushq %rax
1668; CHECK-NEXT:    .cfi_def_cfa_offset 16
1669; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1670; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1671; CHECK-NEXT:    #APP
1672; CHECK-NEXT:    nop
1673; CHECK-NEXT:    #NO_APP
1674; CHECK-NEXT:    kmovq %rsi, %k1
1675; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
1676; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1677; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
1678; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1679; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1680; CHECK-NEXT:    popq %rax
1681; CHECK-NEXT:    .cfi_def_cfa_offset 8
1682; CHECK-NEXT:    retq
1683  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1684  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1685  %3 = bitcast i64 %mask to <64 x i1>
1686  %4 = load <64 x i8>, ptr %passthru
1687  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1688  ret <64 x i8> %5
1689}
1690
1691define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1692; CHECK-LABEL: stack_fold_palignr_maskz:
1693; CHECK:       # %bb.0:
1694; CHECK-NEXT:    pushq %rax
1695; CHECK-NEXT:    .cfi_def_cfa_offset 16
1696; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1697; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1698; CHECK-NEXT:    #APP
1699; CHECK-NEXT:    nop
1700; CHECK-NEXT:    #NO_APP
1701; CHECK-NEXT:    kmovq %rdi, %k1
1702; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1703; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1704; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1705; CHECK-NEXT:    popq %rax
1706; CHECK-NEXT:    .cfi_def_cfa_offset 8
1707; CHECK-NEXT:    retq
1708  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1709  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1710  %3 = bitcast i64 %mask to <64 x i1>
1711  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1712  ret <64 x i8> %4
1713}
1714
1715define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) {
1716; CHECK-LABEL: stack_fold_pandd:
1717; CHECK:       # %bb.0:
1718; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1719; CHECK-NEXT:    #APP
1720; CHECK-NEXT:    nop
1721; CHECK-NEXT:    #NO_APP
1722; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1723; CHECK-NEXT:    retq
1724  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1725  %2 = and <16 x i32> %a0, %a1
1726  ret <16 x i32> %2
1727}
1728
1729define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
1730; CHECK-LABEL: stack_fold_pandd_commuted:
1731; CHECK:       # %bb.0:
1732; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1733; CHECK-NEXT:    #APP
1734; CHECK-NEXT:    nop
1735; CHECK-NEXT:    #NO_APP
1736; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1737; CHECK-NEXT:    retq
1738  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1739  %2 = and <16 x i32> %a1, %a0
1740  ret <16 x i32> %2
1741}
1742
1743define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
1744; CHECK-LABEL: stack_fold_pandd_mask:
1745; CHECK:       # %bb.0:
1746; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1747; CHECK-NEXT:    vmovaps %zmm0, %zmm1
1748; CHECK-NEXT:    #APP
1749; CHECK-NEXT:    nop
1750; CHECK-NEXT:    #NO_APP
1751; CHECK-NEXT:    kmovd %esi, %k1
1752; CHECK-NEXT:    vmovaps (%rdi), %zmm0
1753; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1754; CHECK-NEXT:    retq
1755  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1756  %2 = and <16 x i32> %a0, %a1
1757  %3 = bitcast i16 %mask to <16 x i1>
1758  ; load needed to keep the operation from being scheduled about the asm block
1759  %4 = load <16 x i32>, ptr %a2
1760  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1761  ret <16 x i32> %5
1762}
1763
1764define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
1765; CHECK-LABEL: stack_fold_pandd_mask_commuted:
1766; CHECK:       # %bb.0:
1767; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1768; CHECK-NEXT:    vmovaps %zmm0, %zmm1
1769; CHECK-NEXT:    #APP
1770; CHECK-NEXT:    nop
1771; CHECK-NEXT:    #NO_APP
1772; CHECK-NEXT:    kmovd %esi, %k1
1773; CHECK-NEXT:    vmovaps (%rdi), %zmm0
1774; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1775; CHECK-NEXT:    retq
1776  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1777  %2 = and <16 x i32> %a1, %a0
1778  %3 = bitcast i16 %mask to <16 x i1>
1779  ; load needed to keep the operation from being scheduled about the asm block
1780  %4 = load <16 x i32>, ptr %a2
1781  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1782  ret <16 x i32> %5
1783}
1784
1785define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1786; CHECK-LABEL: stack_fold_pandd_maskz:
1787; CHECK:       # %bb.0:
1788; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1789; CHECK-NEXT:    #APP
1790; CHECK-NEXT:    nop
1791; CHECK-NEXT:    #NO_APP
1792; CHECK-NEXT:    kmovd %edi, %k1
1793; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1794; CHECK-NEXT:    retq
1795  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1796  %2 = and <16 x i32> %a0, %a1
1797  %3 = bitcast i16 %mask to <16 x i1>
1798  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1799  ret <16 x i32> %4
1800}
1801
1802define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1803; CHECK-LABEL: stack_fold_pandd_maskz_commuted:
1804; CHECK:       # %bb.0:
1805; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1806; CHECK-NEXT:    #APP
1807; CHECK-NEXT:    nop
1808; CHECK-NEXT:    #NO_APP
1809; CHECK-NEXT:    kmovd %edi, %k1
1810; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1811; CHECK-NEXT:    retq
1812  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1813  %2 = and <16 x i32> %a1, %a0
1814  %3 = bitcast i16 %mask to <16 x i1>
1815  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1816  ret <16 x i32> %4
1817}
1818
1819define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) {
1820; CHECK-LABEL: stack_fold_pandq:
1821; CHECK:       # %bb.0:
1822; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1823; CHECK-NEXT:    #APP
1824; CHECK-NEXT:    nop
1825; CHECK-NEXT:    #NO_APP
1826; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1827; CHECK-NEXT:    retq
1828  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1829  %2 = and <8 x i64> %a0, %a1
1830  ret <8 x i64> %2
1831}
1832
1833define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1834; CHECK-LABEL: stack_fold_pandq_commuted:
1835; CHECK:       # %bb.0:
1836; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1837; CHECK-NEXT:    #APP
1838; CHECK-NEXT:    nop
1839; CHECK-NEXT:    #NO_APP
1840; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1841; CHECK-NEXT:    retq
1842  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1843  %2 = and <8 x i64> %a1, %a0
1844  ret <8 x i64> %2
1845}
1846
1847define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1848; CHECK-LABEL: stack_fold_pandq_mask:
1849; CHECK:       # %bb.0:
1850; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1851; CHECK-NEXT:    vmovapd %zmm0, %zmm1
1852; CHECK-NEXT:    #APP
1853; CHECK-NEXT:    nop
1854; CHECK-NEXT:    #NO_APP
1855; CHECK-NEXT:    kmovd %esi, %k1
1856; CHECK-NEXT:    vmovapd (%rdi), %zmm0
1857; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1858; CHECK-NEXT:    retq
1859  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1860  %2 = and <8 x i64> %a0, %a1
1861  %3 = bitcast i8 %mask to <8 x i1>
1862  ; load needed to keep the operation from being scheduled about the asm block
1863  %4 = load <8 x i64>, ptr %a2
1864  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1865  ret <8 x i64> %5
1866}
1867
1868define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1869; CHECK-LABEL: stack_fold_pandq_mask_commuted:
1870; CHECK:       # %bb.0:
1871; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1872; CHECK-NEXT:    vmovapd %zmm0, %zmm1
1873; CHECK-NEXT:    #APP
1874; CHECK-NEXT:    nop
1875; CHECK-NEXT:    #NO_APP
1876; CHECK-NEXT:    kmovd %esi, %k1
1877; CHECK-NEXT:    vmovapd (%rdi), %zmm0
1878; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1879; CHECK-NEXT:    retq
1880  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1881  %2 = and <8 x i64> %a1, %a0
1882  %3 = bitcast i8 %mask to <8 x i1>
1883  ; load needed to keep the operation from being scheduled about the asm block
1884  %4 = load <8 x i64>, ptr %a2
1885  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1886  ret <8 x i64> %5
1887}
1888
1889define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1890; CHECK-LABEL: stack_fold_pandq_maskz:
1891; CHECK:       # %bb.0:
1892; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1893; CHECK-NEXT:    #APP
1894; CHECK-NEXT:    nop
1895; CHECK-NEXT:    #NO_APP
1896; CHECK-NEXT:    kmovd %edi, %k1
1897; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1898; CHECK-NEXT:    retq
1899  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1900  %2 = and <8 x i64> %a0, %a1
1901  %3 = bitcast i8 %mask to <8 x i1>
1902  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1903  ret <8 x i64> %4
1904}
1905
1906define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1907; CHECK-LABEL: stack_fold_pandq_maskz_commuted:
1908; CHECK:       # %bb.0:
1909; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1910; CHECK-NEXT:    #APP
1911; CHECK-NEXT:    nop
1912; CHECK-NEXT:    #NO_APP
1913; CHECK-NEXT:    kmovd %edi, %k1
1914; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1915; CHECK-NEXT:    retq
1916  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1917  %2 = and <8 x i64> %a1, %a0
1918  %3 = bitcast i8 %mask to <8 x i1>
1919  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1920  ret <8 x i64> %4
1921}
1922
1923define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) {
1924; CHECK-LABEL: stack_fold_vpconflictd:
1925; CHECK:       # %bb.0:
1926; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1927; CHECK-NEXT:    #APP
1928; CHECK-NEXT:    nop
1929; CHECK-NEXT:    #NO_APP
1930; CHECK-NEXT:    vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1931; CHECK-NEXT:    retq
1932  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1933  %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0)
1934  ret <16 x i32> %2
1935}
1936declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
1937
1938define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) {
1939; CHECK-LABEL: stack_fold_vpconflictq:
1940; CHECK:       # %bb.0:
1941; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1942; CHECK-NEXT:    #APP
1943; CHECK-NEXT:    nop
1944; CHECK-NEXT:    #NO_APP
1945; CHECK-NEXT:    vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1946; CHECK-NEXT:    retq
1947  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1948  %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0)
1949  ret <8 x i64> %2
1950}
1951declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
1952
1953define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) {
1954; CHECK-LABEL: stack_fold_pcmpeqb:
1955; CHECK:       # %bb.0:
1956; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1957; CHECK-NEXT:    #APP
1958; CHECK-NEXT:    nop
1959; CHECK-NEXT:    #NO_APP
1960; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1961; CHECK-NEXT:    kmovq %k0, %rax
1962; CHECK-NEXT:    vzeroupper
1963; CHECK-NEXT:    retq
1964  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1965  %2 = icmp eq <64 x i8> %a0, %a1
1966  %3 = bitcast <64 x i1> %2 to i64
1967  ret i64 %3
1968}
1969
1970define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) {
1971; CHECK-LABEL: stack_fold_pcmpeqd:
1972; CHECK:       # %bb.0:
1973; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1974; CHECK-NEXT:    #APP
1975; CHECK-NEXT:    nop
1976; CHECK-NEXT:    #NO_APP
1977; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1978; CHECK-NEXT:    kmovd %k0, %eax
1979; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1980; CHECK-NEXT:    vzeroupper
1981; CHECK-NEXT:    retq
1982  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1983  %2 = icmp eq <16 x i32> %a0, %a1
1984  %3 = bitcast <16 x i1> %2 to i16
1985  ret i16 %3
1986}
1987
1988define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) {
1989; CHECK-LABEL: stack_fold_pcmpeqq:
1990; CHECK:       # %bb.0:
1991; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1992; CHECK-NEXT:    #APP
1993; CHECK-NEXT:    nop
1994; CHECK-NEXT:    #NO_APP
1995; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1996; CHECK-NEXT:    kmovd %k0, %eax
1997; CHECK-NEXT:    # kill: def $al killed $al killed $eax
1998; CHECK-NEXT:    vzeroupper
1999; CHECK-NEXT:    retq
2000  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2001  %2 = icmp eq <8 x i64> %a0, %a1
2002  %3 = bitcast <8 x i1> %2 to i8
2003  ret i8 %3
2004}
2005
2006define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) {
2007; CHECK-LABEL: stack_fold_pcmpeqw:
2008; CHECK:       # %bb.0:
2009; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2010; CHECK-NEXT:    #APP
2011; CHECK-NEXT:    nop
2012; CHECK-NEXT:    #NO_APP
2013; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2014; CHECK-NEXT:    kmovd %k0, %eax
2015; CHECK-NEXT:    vzeroupper
2016; CHECK-NEXT:    retq
2017  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2018  %2 = icmp eq <32 x i16> %a0, %a1
2019  %3 = bitcast <32 x i1> %2 to i32
2020  ret i32 %3
2021}
2022
2023define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2024; CHECK-LABEL: stack_fold_pcmpeqd_mask:
2025; CHECK:       # %bb.0:
2026; CHECK-NEXT:    subq $136, %rsp
2027; CHECK-NEXT:    .cfi_def_cfa_offset 144
2028; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2029; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
2030; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2031; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2032; CHECK-NEXT:    #APP
2033; CHECK-NEXT:    nop
2034; CHECK-NEXT:    #NO_APP
2035; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2036; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2037; CHECK-NEXT:    kmovd %esi, %k1
2038; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2039; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2040; CHECK-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
2041; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
2042; CHECK-NEXT:    addq $136, %rsp
2043; CHECK-NEXT:    .cfi_def_cfa_offset 8
2044; CHECK-NEXT:    retq
2045  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2046  ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2047  %2 = load <16 x i32>, ptr %a2
2048  %3 = add <16 x i32> %a1, %2
2049  %4 = bitcast i16 %mask to <16 x i1>
2050  %5 = icmp eq <16 x i32> %3, %a0
2051  %6 = and <16 x i1> %4, %5
2052  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2053  ret <16 x i32> %7
2054}
2055
2056define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2057; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted:
2058; CHECK:       # %bb.0:
2059; CHECK-NEXT:    subq $136, %rsp
2060; CHECK-NEXT:    .cfi_def_cfa_offset 144
2061; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2062; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
2063; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2064; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2065; CHECK-NEXT:    #APP
2066; CHECK-NEXT:    nop
2067; CHECK-NEXT:    #NO_APP
2068; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2069; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2070; CHECK-NEXT:    kmovd %esi, %k1
2071; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2072; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2073; CHECK-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
2074; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
2075; CHECK-NEXT:    addq $136, %rsp
2076; CHECK-NEXT:    .cfi_def_cfa_offset 8
2077; CHECK-NEXT:    retq
2078  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2079  ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2080  %2 = load <16 x i32>, ptr %a2
2081  %3 = add <16 x i32> %a1, %2
2082  %4 = bitcast i16 %mask to <16 x i1>
2083  %5 = icmp eq <16 x i32> %a0, %3
2084  %6 = and <16 x i1> %4, %5
2085  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2086  ret <16 x i32> %7
2087}
2088
2089define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2090; CHECK-LABEL: stack_fold_pcmpled_mask:
2091; CHECK:       # %bb.0:
2092; CHECK-NEXT:    subq $136, %rsp
2093; CHECK-NEXT:    .cfi_def_cfa_offset 144
2094; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2095; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
2096; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2097; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2098; CHECK-NEXT:    #APP
2099; CHECK-NEXT:    nop
2100; CHECK-NEXT:    #NO_APP
2101; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2102; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2103; CHECK-NEXT:    kmovd %esi, %k1
2104; CHECK-NEXT:    vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2105; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2106; CHECK-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
2107; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
2108; CHECK-NEXT:    addq $136, %rsp
2109; CHECK-NEXT:    .cfi_def_cfa_offset 8
2110; CHECK-NEXT:    retq
2111  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2112  ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2113  %2 = load <16 x i32>, ptr %a2
2114  %3 = add <16 x i32> %a1, %2
2115  %4 = bitcast i16 %mask to <16 x i1>
2116  %5 = icmp sge <16 x i32> %a0, %3
2117  %6 = and <16 x i1> %4, %5
2118  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2119  ret <16 x i32> %7
2120}
2121
2122define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
2123; CHECK-LABEL: stack_fold_pcmpleud:
2124; CHECK:       # %bb.0:
2125; CHECK-NEXT:    pushq %rax
2126; CHECK-NEXT:    .cfi_def_cfa_offset 16
2127; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2128; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2129; CHECK-NEXT:    #APP
2130; CHECK-NEXT:    nop
2131; CHECK-NEXT:    #NO_APP
2132; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2133; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2134; CHECK-NEXT:    vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2135; CHECK-NEXT:    kmovd %k0, %eax
2136; CHECK-NEXT:    andl %esi, %eax
2137; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
2138; CHECK-NEXT:    popq %rcx
2139; CHECK-NEXT:    .cfi_def_cfa_offset 8
2140; CHECK-NEXT:    vzeroupper
2141; CHECK-NEXT:    retq
2142  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2143  %2 = load <16 x i32>, ptr %a2
2144  %3 = add <16 x i32> %a1, %2
2145  %4 = bitcast i16 %mask to <16 x i1>
2146  %5 = icmp uge <16 x i32> %a0, %3
2147  %6 = and <16 x i1> %5, %4
2148  %7 = bitcast <16 x i1> %6 to i16
2149  ret i16 %7
2150}
2151
2152define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
2153; CHECK-LABEL: stack_fold_permbvar:
2154; CHECK:       # %bb.0:
2155; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2156; CHECK-NEXT:    #APP
2157; CHECK-NEXT:    nop
2158; CHECK-NEXT:    #NO_APP
2159; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2160; CHECK-NEXT:    retq
2161  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2162  %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2163  ret <64 x i8> %2
2164}
2165declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly
2166
2167define <64 x i8> @stack_fold_permbvar_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2168; CHECK-LABEL: stack_fold_permbvar_mask:
2169; CHECK:       # %bb.0:
2170; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2171; CHECK-NEXT:    #APP
2172; CHECK-NEXT:    nop
2173; CHECK-NEXT:    #NO_APP
2174; CHECK-NEXT:    kmovq %rsi, %k1
2175; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2176; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2177; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2178; CHECK-NEXT:    retq
2179  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2180  %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2181  %3 = bitcast i64 %mask to <64 x i1>
2182  ; load needed to keep the operation from being scheduled above the asm block
2183  %4 = load <64 x i8>, ptr %passthru
2184  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
2185  ret <64 x i8> %5
2186}
2187
2188define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2189; CHECK-LABEL: stack_fold_permbvar_maskz:
2190; CHECK:       # %bb.0:
2191; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2192; CHECK-NEXT:    #APP
2193; CHECK-NEXT:    nop
2194; CHECK-NEXT:    #NO_APP
2195; CHECK-NEXT:    kmovq %rdi, %k1
2196; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2197; CHECK-NEXT:    retq
2198  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2199  %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2200  %3 = bitcast i64 %mask to <64 x i1>
2201  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
2202  ret <64 x i8> %4
2203}
2204
2205define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
2206; CHECK-LABEL: stack_fold_permd:
2207; CHECK:       # %bb.0:
2208; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2209; CHECK-NEXT:    #APP
2210; CHECK-NEXT:    nop
2211; CHECK-NEXT:    #NO_APP
2212; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2213; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
2214; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
2215; CHECK-NEXT:    retq
2216  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2217  %2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0)
2218  ; add forces execution domain
2219  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2220  ret <16 x i32> %3
2221}
2222declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly
2223
2224define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2225; CHECK-LABEL: stack_fold_vpermi2b:
2226; CHECK:       # %bb.0:
2227; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2228; CHECK-NEXT:    #APP
2229; CHECK-NEXT:    nop
2230; CHECK-NEXT:    #NO_APP
2231; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2232; CHECK-NEXT:    retq
2233  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2234  %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2)
2235  ret <64 x i8> %2
2236}
2237
2238define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2239; CHECK-LABEL: stack_fold_vpermi2d:
2240; CHECK:       # %bb.0:
2241; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2242; CHECK-NEXT:    #APP
2243; CHECK-NEXT:    nop
2244; CHECK-NEXT:    #NO_APP
2245; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2246; CHECK-NEXT:    retq
2247  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2248  %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
2249  ret <16 x i32> %2
2250}
2251
2252define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2253; CHECK-LABEL: stack_fold_vpermi2q:
2254; CHECK:       # %bb.0:
2255; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2256; CHECK-NEXT:    #APP
2257; CHECK-NEXT:    nop
2258; CHECK-NEXT:    #NO_APP
2259; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2260; CHECK-NEXT:    retq
2261  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2262  %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
2263  ret <8 x i64> %2
2264}
2265
2266define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2267; CHECK-LABEL: stack_fold_vpermi2w:
2268; CHECK:       # %bb.0:
2269; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2270; CHECK-NEXT:    #APP
2271; CHECK-NEXT:    nop
2272; CHECK-NEXT:    #NO_APP
2273; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2274; CHECK-NEXT:    retq
2275  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2276  %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
2277  ret <32 x i16> %2
2278}
2279
2280define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
2281; CHECK-LABEL: stack_fold_permq:
2282; CHECK:       # %bb.0:
2283; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2284; CHECK-NEXT:    #APP
2285; CHECK-NEXT:    nop
2286; CHECK-NEXT:    #NO_APP
2287; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2288; CHECK-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
2289; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
2290; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
2291; CHECK-NEXT:    retq
2292  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2293  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2294  ; add forces execution domain
2295  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2296  ret <8 x i64> %3
2297}
2298
2299define <8 x i64> @stack_fold_permq_mask(ptr %passthru, <8 x i64> %a0, i8 %mask) {
2300; CHECK-LABEL: stack_fold_permq_mask:
2301; CHECK:       # %bb.0:
2302; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2303; CHECK-NEXT:    #APP
2304; CHECK-NEXT:    nop
2305; CHECK-NEXT:    #NO_APP
2306; CHECK-NEXT:    kmovd %esi, %k1
2307; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
2308; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
2309; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
2310; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
2311; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
2312; CHECK-NEXT:    retq
2313  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2314  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2315  %3 = bitcast i8 %mask to <8 x i1>
2316  ; load needed to keep the operation from being scheduled above the asm block
2317  %4 = load <8 x i64>, ptr %passthru
2318  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2319  ; add forces execution domain
2320  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2321  ret <8 x i64> %6
2322}
2323
2324define <8 x i64> @stack_fold_permq_maskz(ptr %passthru, <8 x i64> %a0, i8 %mask) {
2325; CHECK-LABEL: stack_fold_permq_maskz:
2326; CHECK:       # %bb.0:
2327; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2328; CHECK-NEXT:    #APP
2329; CHECK-NEXT:    nop
2330; CHECK-NEXT:    #NO_APP
2331; CHECK-NEXT:    kmovd %esi, %k1
2332; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
2333; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
2334; CHECK-NEXT:    retq
2335  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2336  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2337  %3 = bitcast i8 %mask to <8 x i1>
2338  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
2339  ret <8 x i64> %4
2340}
2341
2342define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
2343; CHECK-LABEL: stack_fold_permqvar:
2344; CHECK:       # %bb.0:
2345; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2346; CHECK-NEXT:    #APP
2347; CHECK-NEXT:    nop
2348; CHECK-NEXT:    #NO_APP
2349; CHECK-NEXT:    vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2350; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
2351; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
2352; CHECK-NEXT:    retq
2353  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2354  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2355  ; add forces execution domain
2356  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2357  ret <8 x i64> %3
2358}
2359declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly
2360
2361define <8 x i64> @stack_fold_permqvar_mask(ptr %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2362; CHECK-LABEL: stack_fold_permqvar_mask:
2363; CHECK:       # %bb.0:
2364; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2365; CHECK-NEXT:    #APP
2366; CHECK-NEXT:    nop
2367; CHECK-NEXT:    #NO_APP
2368; CHECK-NEXT:    kmovd %esi, %k1
2369; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
2370; CHECK-NEXT:    vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
2371; CHECK-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
2372; CHECK-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
2373; CHECK-NEXT:    retq
2374  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2375  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2376  %3 = bitcast i8 %mask to <8 x i1>
2377  ; load needed to keep the operation from being scheduled above the asm block
2378  %4 = load <8 x i64>, ptr %passthru
2379  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2380  ; add forces execution domain
2381  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2382  ret <8 x i64> %6
2383}
2384
2385define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2386; CHECK-LABEL: stack_fold_vpermt2b:
2387; CHECK:       # %bb.0:
2388; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2389; CHECK-NEXT:    #APP
2390; CHECK-NEXT:    nop
2391; CHECK-NEXT:    #NO_APP
2392; CHECK-NEXT:    vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2393; CHECK-NEXT:    retq
2394  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2395  %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
2396  ret <64 x i8> %2
2397}
2398declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>)
2399
2400define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2401; CHECK-LABEL: stack_fold_vpermt2d:
2402; CHECK:       # %bb.0:
2403; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2404; CHECK-NEXT:    #APP
2405; CHECK-NEXT:    nop
2406; CHECK-NEXT:    #NO_APP
2407; CHECK-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2408; CHECK-NEXT:    retq
2409  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2410  %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
2411  ret <16 x i32> %2
2412}
2413declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2414
2415define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2416; CHECK-LABEL: stack_fold_vpermt2q:
2417; CHECK:       # %bb.0:
2418; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2419; CHECK-NEXT:    #APP
2420; CHECK-NEXT:    nop
2421; CHECK-NEXT:    #NO_APP
2422; CHECK-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2423; CHECK-NEXT:    retq
2424  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2425  %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
2426  ret <8 x i64> %2
2427}
2428declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2429
2430define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2431; CHECK-LABEL: stack_fold_vpermt2w:
2432; CHECK:       # %bb.0:
2433; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2434; CHECK-NEXT:    #APP
2435; CHECK-NEXT:    nop
2436; CHECK-NEXT:    #NO_APP
2437; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2438; CHECK-NEXT:    retq
2439  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2440  %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
2441  ret <32 x i16> %2
2442}
2443declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
2444
2445define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
2446; CHECK-LABEL: stack_fold_permwvar:
2447; CHECK:       # %bb.0:
2448; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2449; CHECK-NEXT:    #APP
2450; CHECK-NEXT:    nop
2451; CHECK-NEXT:    #NO_APP
2452; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2453; CHECK-NEXT:    retq
2454  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2455  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2456  ret <32 x i16> %2
2457}
2458declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly
2459
2460define <32 x i16> @stack_fold_permwvar_mask(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2461; CHECK-LABEL: stack_fold_permwvar_mask:
2462; CHECK:       # %bb.0:
2463; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2464; CHECK-NEXT:    #APP
2465; CHECK-NEXT:    nop
2466; CHECK-NEXT:    #NO_APP
2467; CHECK-NEXT:    kmovd %esi, %k1
2468; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2469; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2470; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2471; CHECK-NEXT:    retq
2472  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2473  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2474  %3 = bitcast i32 %mask to <32 x i1>
2475  ; load needed to keep the operation from being scheduled above the asm block
2476  %4 = load <32 x i16>, ptr %passthru
2477  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2478  ret <32 x i16> %5
2479}
2480
2481define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2482; CHECK-LABEL: stack_fold_permwvar_maskz:
2483; CHECK:       # %bb.0:
2484; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2485; CHECK-NEXT:    #APP
2486; CHECK-NEXT:    nop
2487; CHECK-NEXT:    #NO_APP
2488; CHECK-NEXT:    kmovd %edi, %k1
2489; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2490; CHECK-NEXT:    retq
2491  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2492  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2493  %3 = bitcast i32 %mask to <32 x i1>
2494  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2495  ret <32 x i16> %4
2496}
2497
2498define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
2499; CHECK-LABEL: stack_fold_pextrd:
2500; CHECK:       # %bb.0:
2501; CHECK-NEXT:    pushq %rbp
2502; CHECK-NEXT:    .cfi_def_cfa_offset 16
2503; CHECK-NEXT:    pushq %r15
2504; CHECK-NEXT:    .cfi_def_cfa_offset 24
2505; CHECK-NEXT:    pushq %r14
2506; CHECK-NEXT:    .cfi_def_cfa_offset 32
2507; CHECK-NEXT:    pushq %r13
2508; CHECK-NEXT:    .cfi_def_cfa_offset 40
2509; CHECK-NEXT:    pushq %r12
2510; CHECK-NEXT:    .cfi_def_cfa_offset 48
2511; CHECK-NEXT:    pushq %rbx
2512; CHECK-NEXT:    .cfi_def_cfa_offset 56
2513; CHECK-NEXT:    .cfi_offset %rbx, -56
2514; CHECK-NEXT:    .cfi_offset %r12, -48
2515; CHECK-NEXT:    .cfi_offset %r13, -40
2516; CHECK-NEXT:    .cfi_offset %r14, -32
2517; CHECK-NEXT:    .cfi_offset %r15, -24
2518; CHECK-NEXT:    .cfi_offset %rbp, -16
2519; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2520; CHECK-NEXT:    vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
2521; CHECK-NEXT:    #APP
2522; CHECK-NEXT:    nop
2523; CHECK-NEXT:    #NO_APP
2524; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2525; CHECK-NEXT:    popq %rbx
2526; CHECK-NEXT:    .cfi_def_cfa_offset 48
2527; CHECK-NEXT:    popq %r12
2528; CHECK-NEXT:    .cfi_def_cfa_offset 40
2529; CHECK-NEXT:    popq %r13
2530; CHECK-NEXT:    .cfi_def_cfa_offset 32
2531; CHECK-NEXT:    popq %r14
2532; CHECK-NEXT:    .cfi_def_cfa_offset 24
2533; CHECK-NEXT:    popq %r15
2534; CHECK-NEXT:    .cfi_def_cfa_offset 16
2535; CHECK-NEXT:    popq %rbp
2536; CHECK-NEXT:    .cfi_def_cfa_offset 8
2537; CHECK-NEXT:    retq
2538  ; add forces execution domain
2539  %1 = add <4 x i32> %a0, %a1
2540  %2 = extractelement <4 x i32> %1, i32 1
2541  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2542  ret i32 %2
2543}
2544
2545define i64 @stack_fold_pextrq(<2 x i64> %a0) {
2546; CHECK-LABEL: stack_fold_pextrq:
2547; CHECK:       # %bb.0:
2548; CHECK-NEXT:    pushq %rbp
2549; CHECK-NEXT:    .cfi_def_cfa_offset 16
2550; CHECK-NEXT:    pushq %r15
2551; CHECK-NEXT:    .cfi_def_cfa_offset 24
2552; CHECK-NEXT:    pushq %r14
2553; CHECK-NEXT:    .cfi_def_cfa_offset 32
2554; CHECK-NEXT:    pushq %r13
2555; CHECK-NEXT:    .cfi_def_cfa_offset 40
2556; CHECK-NEXT:    pushq %r12
2557; CHECK-NEXT:    .cfi_def_cfa_offset 48
2558; CHECK-NEXT:    pushq %rbx
2559; CHECK-NEXT:    .cfi_def_cfa_offset 56
2560; CHECK-NEXT:    .cfi_offset %rbx, -56
2561; CHECK-NEXT:    .cfi_offset %r12, -48
2562; CHECK-NEXT:    .cfi_offset %r13, -40
2563; CHECK-NEXT:    .cfi_offset %r14, -32
2564; CHECK-NEXT:    .cfi_offset %r15, -24
2565; CHECK-NEXT:    .cfi_offset %rbp, -16
2566; CHECK-NEXT:    vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2567; CHECK-NEXT:    #APP
2568; CHECK-NEXT:    nop
2569; CHECK-NEXT:    #NO_APP
2570; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2571; CHECK-NEXT:    popq %rbx
2572; CHECK-NEXT:    .cfi_def_cfa_offset 48
2573; CHECK-NEXT:    popq %r12
2574; CHECK-NEXT:    .cfi_def_cfa_offset 40
2575; CHECK-NEXT:    popq %r13
2576; CHECK-NEXT:    .cfi_def_cfa_offset 32
2577; CHECK-NEXT:    popq %r14
2578; CHECK-NEXT:    .cfi_def_cfa_offset 24
2579; CHECK-NEXT:    popq %r15
2580; CHECK-NEXT:    .cfi_def_cfa_offset 16
2581; CHECK-NEXT:    popq %rbp
2582; CHECK-NEXT:    .cfi_def_cfa_offset 8
2583; CHECK-NEXT:    retq
2584  %1 = extractelement <2 x i64> %a0, i32 1
2585  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2586  ret i64 %1
2587}
2588
2589define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
2590; CHECK-LABEL: stack_fold_pinsrb:
2591; CHECK:       # %bb.0:
2592; CHECK-NEXT:    pushq %rbp
2593; CHECK-NEXT:    .cfi_def_cfa_offset 16
2594; CHECK-NEXT:    pushq %r15
2595; CHECK-NEXT:    .cfi_def_cfa_offset 24
2596; CHECK-NEXT:    pushq %r14
2597; CHECK-NEXT:    .cfi_def_cfa_offset 32
2598; CHECK-NEXT:    pushq %r13
2599; CHECK-NEXT:    .cfi_def_cfa_offset 40
2600; CHECK-NEXT:    pushq %r12
2601; CHECK-NEXT:    .cfi_def_cfa_offset 48
2602; CHECK-NEXT:    pushq %rbx
2603; CHECK-NEXT:    .cfi_def_cfa_offset 56
2604; CHECK-NEXT:    .cfi_offset %rbx, -56
2605; CHECK-NEXT:    .cfi_offset %r12, -48
2606; CHECK-NEXT:    .cfi_offset %r13, -40
2607; CHECK-NEXT:    .cfi_offset %r14, -32
2608; CHECK-NEXT:    .cfi_offset %r15, -24
2609; CHECK-NEXT:    .cfi_offset %rbp, -16
2610; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2611; CHECK-NEXT:    #APP
2612; CHECK-NEXT:    nop
2613; CHECK-NEXT:    #NO_APP
2614; CHECK-NEXT:    vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2615; CHECK-NEXT:    popq %rbx
2616; CHECK-NEXT:    .cfi_def_cfa_offset 48
2617; CHECK-NEXT:    popq %r12
2618; CHECK-NEXT:    .cfi_def_cfa_offset 40
2619; CHECK-NEXT:    popq %r13
2620; CHECK-NEXT:    .cfi_def_cfa_offset 32
2621; CHECK-NEXT:    popq %r14
2622; CHECK-NEXT:    .cfi_def_cfa_offset 24
2623; CHECK-NEXT:    popq %r15
2624; CHECK-NEXT:    .cfi_def_cfa_offset 16
2625; CHECK-NEXT:    popq %rbp
2626; CHECK-NEXT:    .cfi_def_cfa_offset 8
2627; CHECK-NEXT:    retq
2628  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2629  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
2630  ret <16 x i8> %2
2631}
2632
2633define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
2634; CHECK-LABEL: stack_fold_pinsrd:
2635; CHECK:       # %bb.0:
2636; CHECK-NEXT:    pushq %rbp
2637; CHECK-NEXT:    .cfi_def_cfa_offset 16
2638; CHECK-NEXT:    pushq %r15
2639; CHECK-NEXT:    .cfi_def_cfa_offset 24
2640; CHECK-NEXT:    pushq %r14
2641; CHECK-NEXT:    .cfi_def_cfa_offset 32
2642; CHECK-NEXT:    pushq %r13
2643; CHECK-NEXT:    .cfi_def_cfa_offset 40
2644; CHECK-NEXT:    pushq %r12
2645; CHECK-NEXT:    .cfi_def_cfa_offset 48
2646; CHECK-NEXT:    pushq %rbx
2647; CHECK-NEXT:    .cfi_def_cfa_offset 56
2648; CHECK-NEXT:    .cfi_offset %rbx, -56
2649; CHECK-NEXT:    .cfi_offset %r12, -48
2650; CHECK-NEXT:    .cfi_offset %r13, -40
2651; CHECK-NEXT:    .cfi_offset %r14, -32
2652; CHECK-NEXT:    .cfi_offset %r15, -24
2653; CHECK-NEXT:    .cfi_offset %rbp, -16
2654; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2655; CHECK-NEXT:    #APP
2656; CHECK-NEXT:    nop
2657; CHECK-NEXT:    #NO_APP
2658; CHECK-NEXT:    vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2659; CHECK-NEXT:    popq %rbx
2660; CHECK-NEXT:    .cfi_def_cfa_offset 48
2661; CHECK-NEXT:    popq %r12
2662; CHECK-NEXT:    .cfi_def_cfa_offset 40
2663; CHECK-NEXT:    popq %r13
2664; CHECK-NEXT:    .cfi_def_cfa_offset 32
2665; CHECK-NEXT:    popq %r14
2666; CHECK-NEXT:    .cfi_def_cfa_offset 24
2667; CHECK-NEXT:    popq %r15
2668; CHECK-NEXT:    .cfi_def_cfa_offset 16
2669; CHECK-NEXT:    popq %rbp
2670; CHECK-NEXT:    .cfi_def_cfa_offset 8
2671; CHECK-NEXT:    retq
2672  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2673  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
2674  ret <4 x i32> %2
2675}
2676
2677define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
2678; CHECK-LABEL: stack_fold_pinsrq:
2679; CHECK:       # %bb.0:
2680; CHECK-NEXT:    pushq %rbp
2681; CHECK-NEXT:    .cfi_def_cfa_offset 16
2682; CHECK-NEXT:    pushq %r15
2683; CHECK-NEXT:    .cfi_def_cfa_offset 24
2684; CHECK-NEXT:    pushq %r14
2685; CHECK-NEXT:    .cfi_def_cfa_offset 32
2686; CHECK-NEXT:    pushq %r13
2687; CHECK-NEXT:    .cfi_def_cfa_offset 40
2688; CHECK-NEXT:    pushq %r12
2689; CHECK-NEXT:    .cfi_def_cfa_offset 48
2690; CHECK-NEXT:    pushq %rbx
2691; CHECK-NEXT:    .cfi_def_cfa_offset 56
2692; CHECK-NEXT:    .cfi_offset %rbx, -56
2693; CHECK-NEXT:    .cfi_offset %r12, -48
2694; CHECK-NEXT:    .cfi_offset %r13, -40
2695; CHECK-NEXT:    .cfi_offset %r14, -32
2696; CHECK-NEXT:    .cfi_offset %r15, -24
2697; CHECK-NEXT:    .cfi_offset %rbp, -16
2698; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2699; CHECK-NEXT:    #APP
2700; CHECK-NEXT:    nop
2701; CHECK-NEXT:    #NO_APP
2702; CHECK-NEXT:    vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2703; CHECK-NEXT:    popq %rbx
2704; CHECK-NEXT:    .cfi_def_cfa_offset 48
2705; CHECK-NEXT:    popq %r12
2706; CHECK-NEXT:    .cfi_def_cfa_offset 40
2707; CHECK-NEXT:    popq %r13
2708; CHECK-NEXT:    .cfi_def_cfa_offset 32
2709; CHECK-NEXT:    popq %r14
2710; CHECK-NEXT:    .cfi_def_cfa_offset 24
2711; CHECK-NEXT:    popq %r15
2712; CHECK-NEXT:    .cfi_def_cfa_offset 16
2713; CHECK-NEXT:    popq %rbp
2714; CHECK-NEXT:    .cfi_def_cfa_offset 8
2715; CHECK-NEXT:    retq
2716  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2717  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
2718  ret <2 x i64> %2
2719}
2720
2721define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
2722; CHECK-LABEL: stack_fold_pinsrw:
2723; CHECK:       # %bb.0:
2724; CHECK-NEXT:    pushq %rbp
2725; CHECK-NEXT:    .cfi_def_cfa_offset 16
2726; CHECK-NEXT:    pushq %r15
2727; CHECK-NEXT:    .cfi_def_cfa_offset 24
2728; CHECK-NEXT:    pushq %r14
2729; CHECK-NEXT:    .cfi_def_cfa_offset 32
2730; CHECK-NEXT:    pushq %r13
2731; CHECK-NEXT:    .cfi_def_cfa_offset 40
2732; CHECK-NEXT:    pushq %r12
2733; CHECK-NEXT:    .cfi_def_cfa_offset 48
2734; CHECK-NEXT:    pushq %rbx
2735; CHECK-NEXT:    .cfi_def_cfa_offset 56
2736; CHECK-NEXT:    .cfi_offset %rbx, -56
2737; CHECK-NEXT:    .cfi_offset %r12, -48
2738; CHECK-NEXT:    .cfi_offset %r13, -40
2739; CHECK-NEXT:    .cfi_offset %r14, -32
2740; CHECK-NEXT:    .cfi_offset %r15, -24
2741; CHECK-NEXT:    .cfi_offset %rbp, -16
2742; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2743; CHECK-NEXT:    #APP
2744; CHECK-NEXT:    nop
2745; CHECK-NEXT:    #NO_APP
2746; CHECK-NEXT:    vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2747; CHECK-NEXT:    popq %rbx
2748; CHECK-NEXT:    .cfi_def_cfa_offset 48
2749; CHECK-NEXT:    popq %r12
2750; CHECK-NEXT:    .cfi_def_cfa_offset 40
2751; CHECK-NEXT:    popq %r13
2752; CHECK-NEXT:    .cfi_def_cfa_offset 32
2753; CHECK-NEXT:    popq %r14
2754; CHECK-NEXT:    .cfi_def_cfa_offset 24
2755; CHECK-NEXT:    popq %r15
2756; CHECK-NEXT:    .cfi_def_cfa_offset 16
2757; CHECK-NEXT:    popq %rbp
2758; CHECK-NEXT:    .cfi_def_cfa_offset 8
2759; CHECK-NEXT:    retq
2760  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2761  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
2762  ret <8 x i16> %2
2763}
2764
2765define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) {
2766; CHECK-LABEL: stack_fold_vplzcntd:
2767; CHECK:       # %bb.0:
2768; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2769; CHECK-NEXT:    #APP
2770; CHECK-NEXT:    nop
2771; CHECK-NEXT:    #NO_APP
2772; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2773; CHECK-NEXT:    retq
2774  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2775  %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false)
2776  ret <16 x i32> %2
2777}
2778
2779define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) {
2780; CHECK-LABEL: stack_fold_vplzcntq:
2781; CHECK:       # %bb.0:
2782; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2783; CHECK-NEXT:    #APP
2784; CHECK-NEXT:    nop
2785; CHECK-NEXT:    #NO_APP
2786; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2787; CHECK-NEXT:    retq
2788  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2789  %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false)
2790  ret <8 x i64> %2
2791}
2792
2793define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
2794; CHECK-LABEL: stack_fold_pmaddubsw_zmm:
2795; CHECK:       # %bb.0:
2796; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2797; CHECK-NEXT:    #APP
2798; CHECK-NEXT:    nop
2799; CHECK-NEXT:    #NO_APP
2800; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2801; CHECK-NEXT:    retq
2802  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2803  %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2804  ret <32 x i16> %2
2805}
2806declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) nounwind readnone
2807
2808define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2809; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask:
2810; CHECK:       # %bb.0:
2811; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2812; CHECK-NEXT:    #APP
2813; CHECK-NEXT:    nop
2814; CHECK-NEXT:    #NO_APP
2815; CHECK-NEXT:    kmovd %esi, %k1
2816; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2817; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2818; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2819; CHECK-NEXT:    retq
2820  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2821  %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2822  %3 = bitcast i32 %mask to <32 x i1>
2823  ; load needed to keep the operation from being scheduled about the asm block
2824  %4 = load <32 x i16>, ptr %passthru
2825  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2826  ret <32 x i16> %5
2827}
2828
2829define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2830; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz:
2831; CHECK:       # %bb.0:
2832; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2833; CHECK-NEXT:    #APP
2834; CHECK-NEXT:    nop
2835; CHECK-NEXT:    #NO_APP
2836; CHECK-NEXT:    kmovd %edi, %k1
2837; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2838; CHECK-NEXT:    retq
2839  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2840  %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2841  %3 = bitcast i32 %mask to <32 x i1>
2842  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2843  ret <32 x i16> %4
2844}
2845
2846define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
2847; CHECK-LABEL: stack_fold_pmaddwd_zmm:
2848; CHECK:       # %bb.0:
2849; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2850; CHECK-NEXT:    #APP
2851; CHECK-NEXT:    nop
2852; CHECK-NEXT:    #NO_APP
2853; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2854; CHECK-NEXT:    retq
2855  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2856  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2857  ret <16 x i32> %2
2858}
2859declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone
2860
2861define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) {
2862; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted:
2863; CHECK:       # %bb.0:
2864; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2865; CHECK-NEXT:    #APP
2866; CHECK-NEXT:    nop
2867; CHECK-NEXT:    #NO_APP
2868; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2869; CHECK-NEXT:    retq
2870  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2871  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2872  ret <16 x i32> %2
2873}
2874
2875define <16 x i32> @stack_fold_pmaddwd_zmm_mask(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2876; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask:
2877; CHECK:       # %bb.0:
2878; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2879; CHECK-NEXT:    #APP
2880; CHECK-NEXT:    nop
2881; CHECK-NEXT:    #NO_APP
2882; CHECK-NEXT:    kmovd %esi, %k1
2883; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2884; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2885; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2886; CHECK-NEXT:    retq
2887  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2888  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2889  %3 = bitcast i16 %mask to <16 x i1>
2890  ; load needed to keep the operation from being scheduled about the asm block
2891  %4 = load <16 x i32>, ptr %passthru
2892  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2893  ret <16 x i32> %5
2894}
2895
2896define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2897; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted:
2898; CHECK:       # %bb.0:
2899; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2900; CHECK-NEXT:    #APP
2901; CHECK-NEXT:    nop
2902; CHECK-NEXT:    #NO_APP
2903; CHECK-NEXT:    kmovd %esi, %k1
2904; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2905; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2906; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2907; CHECK-NEXT:    retq
2908  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2909  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2910  %3 = bitcast i16 %mask to <16 x i1>
2911  ; load needed to keep the operation from being scheduled about the asm block
2912  %4 = load <16 x i32>, ptr %passthru
2913  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2914  ret <16 x i32> %5
2915}
2916
2917define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2918; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz:
2919; CHECK:       # %bb.0:
2920; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2921; CHECK-NEXT:    #APP
2922; CHECK-NEXT:    nop
2923; CHECK-NEXT:    #NO_APP
2924; CHECK-NEXT:    kmovd %esi, %k1
2925; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2926; CHECK-NEXT:    retq
2927  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2928  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2929  %3 = bitcast i16 %mask to <16 x i1>
2930  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2931  ret <16 x i32> %4
2932}
2933
2934define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2935; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted:
2936; CHECK:       # %bb.0:
2937; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2938; CHECK-NEXT:    #APP
2939; CHECK-NEXT:    nop
2940; CHECK-NEXT:    #NO_APP
2941; CHECK-NEXT:    kmovd %esi, %k1
2942; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2943; CHECK-NEXT:    retq
2944  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2945  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2946  %3 = bitcast i16 %mask to <16 x i1>
2947  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2948  ret <16 x i32> %4
2949}
2950
2951define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) {
2952; CHECK-LABEL: stack_fold_pmaxsb:
2953; CHECK:       # %bb.0:
2954; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2955; CHECK-NEXT:    #APP
2956; CHECK-NEXT:    nop
2957; CHECK-NEXT:    #NO_APP
2958; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2959; CHECK-NEXT:    retq
2960  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2961  %2 = icmp sgt <64 x i8> %a0, %a1
2962  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2963  ret <64 x i8> %3
2964}
2965
2966define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
2967; CHECK-LABEL: stack_fold_pmaxsb_commuted:
2968; CHECK:       # %bb.0:
2969; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2970; CHECK-NEXT:    #APP
2971; CHECK-NEXT:    nop
2972; CHECK-NEXT:    #NO_APP
2973; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2974; CHECK-NEXT:    retq
2975  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2976  %2 = icmp sgt <64 x i8> %a1, %a0
2977  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
2978  ret <64 x i8> %3
2979}
2980
2981define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
2982; CHECK-LABEL: stack_fold_pmaxsb_mask:
2983; CHECK:       # %bb.0:
2984; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2985; CHECK-NEXT:    #APP
2986; CHECK-NEXT:    nop
2987; CHECK-NEXT:    #NO_APP
2988; CHECK-NEXT:    kmovq %rdi, %k1
2989; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
2990; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2991; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2992; CHECK-NEXT:    retq
2993  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2994  %2 = icmp sgt <64 x i8> %a0, %a1
2995  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2996  %4 = bitcast i64 %mask to <64 x i1>
2997  ; load needed to keep the operation from being scheduled about the asm block
2998  %5 = load <64 x i8>, ptr %passthru
2999  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3000  ret <64 x i8> %6
3001}
3002
3003define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3004; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted:
3005; CHECK:       # %bb.0:
3006; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3007; CHECK-NEXT:    #APP
3008; CHECK-NEXT:    nop
3009; CHECK-NEXT:    #NO_APP
3010; CHECK-NEXT:    kmovq %rdi, %k1
3011; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3012; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3013; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3014; CHECK-NEXT:    retq
3015  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3016  %2 = icmp sgt <64 x i8> %a1, %a0
3017  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3018  %4 = bitcast i64 %mask to <64 x i1>
3019  ; load needed to keep the operation from being scheduled about the asm block
3020  %5 = load <64 x i8>, ptr %passthru
3021  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3022  ret <64 x i8> %6
3023}
3024
3025define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3026; CHECK-LABEL: stack_fold_pmaxsb_maskz:
3027; CHECK:       # %bb.0:
3028; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3029; CHECK-NEXT:    #APP
3030; CHECK-NEXT:    nop
3031; CHECK-NEXT:    #NO_APP
3032; CHECK-NEXT:    kmovq %rdi, %k1
3033; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3034; CHECK-NEXT:    retq
3035  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3036  %2 = icmp sgt <64 x i8> %a0, %a1
3037  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3038  %4 = bitcast i64 %mask to <64 x i1>
3039  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3040  ret <64 x i8> %5
3041}
3042
3043define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3044; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted:
3045; CHECK:       # %bb.0:
3046; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3047; CHECK-NEXT:    #APP
3048; CHECK-NEXT:    nop
3049; CHECK-NEXT:    #NO_APP
3050; CHECK-NEXT:    kmovq %rdi, %k1
3051; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3052; CHECK-NEXT:    retq
3053  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3054  %2 = icmp sgt <64 x i8> %a1, %a0
3055  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3056  %4 = bitcast i64 %mask to <64 x i1>
3057  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3058  ret <64 x i8> %5
3059}
3060
3061define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
3062; CHECK-LABEL: stack_fold_pmaxsd:
3063; CHECK:       # %bb.0:
3064; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3065; CHECK-NEXT:    #APP
3066; CHECK-NEXT:    nop
3067; CHECK-NEXT:    #NO_APP
3068; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3069; CHECK-NEXT:    retq
3070  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3071  %2 = icmp sgt <16 x i32> %a0, %a1
3072  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3073  ret <16 x i32> %3
3074}
3075
3076define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3077; CHECK-LABEL: stack_fold_pmaxsd_commuted:
3078; CHECK:       # %bb.0:
3079; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3080; CHECK-NEXT:    #APP
3081; CHECK-NEXT:    nop
3082; CHECK-NEXT:    #NO_APP
3083; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3084; CHECK-NEXT:    retq
3085  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3086  %2 = icmp sgt <16 x i32> %a1, %a0
3087  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3088  ret <16 x i32> %3
3089}
3090
3091define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3092; CHECK-LABEL: stack_fold_pmaxsd_mask:
3093; CHECK:       # %bb.0:
3094; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3095; CHECK-NEXT:    #APP
3096; CHECK-NEXT:    nop
3097; CHECK-NEXT:    #NO_APP
3098; CHECK-NEXT:    kmovd %edi, %k1
3099; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3100; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3101; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3102; CHECK-NEXT:    retq
3103  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3104  %2 = icmp sgt <16 x i32> %a0, %a1
3105  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3106  %4 = bitcast i16 %mask to <16 x i1>
3107  ; load needed to keep the operation from being scheduled about the asm block
3108  %5 = load <16 x i32>, ptr %passthru
3109  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3110  ret <16 x i32> %6
3111}
3112
3113define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3114; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted:
3115; CHECK:       # %bb.0:
3116; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3117; CHECK-NEXT:    #APP
3118; CHECK-NEXT:    nop
3119; CHECK-NEXT:    #NO_APP
3120; CHECK-NEXT:    kmovd %edi, %k1
3121; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3122; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3123; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3124; CHECK-NEXT:    retq
3125  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3126  %2 = icmp sgt <16 x i32> %a1, %a0
3127  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3128  %4 = bitcast i16 %mask to <16 x i1>
3129  ; load needed to keep the operation from being scheduled about the asm block
3130  %5 = load <16 x i32>, ptr %passthru
3131  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3132  ret <16 x i32> %6
3133}
3134
3135define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3136; CHECK-LABEL: stack_fold_pmaxsd_maskz:
3137; CHECK:       # %bb.0:
3138; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3139; CHECK-NEXT:    #APP
3140; CHECK-NEXT:    nop
3141; CHECK-NEXT:    #NO_APP
3142; CHECK-NEXT:    kmovd %edi, %k1
3143; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3144; CHECK-NEXT:    retq
3145  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3146  %2 = icmp sgt <16 x i32> %a0, %a1
3147  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3148  %4 = bitcast i16 %mask to <16 x i1>
3149  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3150  ret <16 x i32> %5
3151}
3152
3153define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3154; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted:
3155; CHECK:       # %bb.0:
3156; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3157; CHECK-NEXT:    #APP
3158; CHECK-NEXT:    nop
3159; CHECK-NEXT:    #NO_APP
3160; CHECK-NEXT:    kmovd %edi, %k1
3161; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3162; CHECK-NEXT:    retq
3163  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3164  %2 = icmp sgt <16 x i32> %a1, %a0
3165  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3166  %4 = bitcast i16 %mask to <16 x i1>
3167  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3168  ret <16 x i32> %5
3169}
3170
3171define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) {
3172; CHECK-LABEL: stack_fold_pmaxsq:
3173; CHECK:       # %bb.0:
3174; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3175; CHECK-NEXT:    #APP
3176; CHECK-NEXT:    nop
3177; CHECK-NEXT:    #NO_APP
3178; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3179; CHECK-NEXT:    retq
3180  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3181  %2 = icmp sgt <8 x i64> %a0, %a1
3182  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3183  ret <8 x i64> %3
3184}
3185
3186define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3187; CHECK-LABEL: stack_fold_pmaxsq_commuted:
3188; CHECK:       # %bb.0:
3189; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3190; CHECK-NEXT:    #APP
3191; CHECK-NEXT:    nop
3192; CHECK-NEXT:    #NO_APP
3193; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3194; CHECK-NEXT:    retq
3195  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3196  %2 = icmp sgt <8 x i64> %a1, %a0
3197  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3198  ret <8 x i64> %3
3199}
3200
3201define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3202; CHECK-LABEL: stack_fold_pmaxsq_mask:
3203; CHECK:       # %bb.0:
3204; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3205; CHECK-NEXT:    #APP
3206; CHECK-NEXT:    nop
3207; CHECK-NEXT:    #NO_APP
3208; CHECK-NEXT:    kmovd %edi, %k1
3209; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3210; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3211; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3212; CHECK-NEXT:    retq
3213  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3214  %2 = icmp sgt <8 x i64> %a0, %a1
3215  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3216  %4 = bitcast i8 %mask to <8 x i1>
3217  ; load needed to keep the operation from being scheduled about the asm block
3218  %5 = load <8 x i64>, ptr %passthru
3219  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3220  ret <8 x i64> %6
3221}
3222
3223define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3224; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted:
3225; CHECK:       # %bb.0:
3226; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3227; CHECK-NEXT:    #APP
3228; CHECK-NEXT:    nop
3229; CHECK-NEXT:    #NO_APP
3230; CHECK-NEXT:    kmovd %edi, %k1
3231; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3232; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3233; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3234; CHECK-NEXT:    retq
3235  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3236  %2 = icmp sgt <8 x i64> %a1, %a0
3237  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3238  %4 = bitcast i8 %mask to <8 x i1>
3239  ; load needed to keep the operation from being scheduled about the asm block
3240  %5 = load <8 x i64>, ptr %passthru
3241  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3242  ret <8 x i64> %6
3243}
3244
3245define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3246; CHECK-LABEL: stack_fold_pmaxsq_maskz:
3247; CHECK:       # %bb.0:
3248; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3249; CHECK-NEXT:    #APP
3250; CHECK-NEXT:    nop
3251; CHECK-NEXT:    #NO_APP
3252; CHECK-NEXT:    kmovd %edi, %k1
3253; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3254; CHECK-NEXT:    retq
3255  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3256  %2 = icmp sgt <8 x i64> %a0, %a1
3257  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3258  %4 = bitcast i8 %mask to <8 x i1>
3259  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3260  ret <8 x i64> %5
3261}
3262
3263define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3264; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted:
3265; CHECK:       # %bb.0:
3266; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3267; CHECK-NEXT:    #APP
3268; CHECK-NEXT:    nop
3269; CHECK-NEXT:    #NO_APP
3270; CHECK-NEXT:    kmovd %edi, %k1
3271; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3272; CHECK-NEXT:    retq
3273  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3274  %2 = icmp sgt <8 x i64> %a1, %a0
3275  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3276  %4 = bitcast i8 %mask to <8 x i1>
3277  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3278  ret <8 x i64> %5
3279}
3280
3281define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) {
3282; CHECK-LABEL: stack_fold_pmaxsw:
3283; CHECK:       # %bb.0:
3284; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3285; CHECK-NEXT:    #APP
3286; CHECK-NEXT:    nop
3287; CHECK-NEXT:    #NO_APP
3288; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3289; CHECK-NEXT:    retq
3290  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3291  %2 = icmp sgt <32 x i16> %a0, %a1
3292  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3293  ret <32 x i16> %3
3294}
3295
3296define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3297; CHECK-LABEL: stack_fold_pmaxsw_commuted:
3298; CHECK:       # %bb.0:
3299; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3300; CHECK-NEXT:    #APP
3301; CHECK-NEXT:    nop
3302; CHECK-NEXT:    #NO_APP
3303; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3304; CHECK-NEXT:    retq
3305  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3306  %2 = icmp sgt <32 x i16> %a1, %a0
3307  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3308  ret <32 x i16> %3
3309}
3310
3311define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3312; CHECK-LABEL: stack_fold_pmaxsw_mask:
3313; CHECK:       # %bb.0:
3314; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3315; CHECK-NEXT:    #APP
3316; CHECK-NEXT:    nop
3317; CHECK-NEXT:    #NO_APP
3318; CHECK-NEXT:    kmovd %edi, %k1
3319; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3320; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3321; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3322; CHECK-NEXT:    retq
3323  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3324  %2 = icmp sgt <32 x i16> %a0, %a1
3325  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3326  %4 = bitcast i32 %mask to <32 x i1>
3327  ; load needed to keep the operation from being scheduled about the asm block
3328  %5 = load <32 x i16>, ptr %passthru
3329  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3330  ret <32 x i16> %6
3331}
3332
3333define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3334; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted:
3335; CHECK:       # %bb.0:
3336; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3337; CHECK-NEXT:    #APP
3338; CHECK-NEXT:    nop
3339; CHECK-NEXT:    #NO_APP
3340; CHECK-NEXT:    kmovd %edi, %k1
3341; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3342; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3343; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3344; CHECK-NEXT:    retq
3345  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3346  %2 = icmp sgt <32 x i16> %a1, %a0
3347  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3348  %4 = bitcast i32 %mask to <32 x i1>
3349  ; load needed to keep the operation from being scheduled about the asm block
3350  %5 = load <32 x i16>, ptr %passthru
3351  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3352  ret <32 x i16> %6
3353}
3354
3355define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3356; CHECK-LABEL: stack_fold_pmaxsw_maskz:
3357; CHECK:       # %bb.0:
3358; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3359; CHECK-NEXT:    #APP
3360; CHECK-NEXT:    nop
3361; CHECK-NEXT:    #NO_APP
3362; CHECK-NEXT:    kmovd %edi, %k1
3363; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3364; CHECK-NEXT:    retq
3365  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3366  %2 = icmp sgt <32 x i16> %a0, %a1
3367  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3368  %4 = bitcast i32 %mask to <32 x i1>
3369  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3370  ret <32 x i16> %5
3371}
3372
3373define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3374; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted:
3375; CHECK:       # %bb.0:
3376; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3377; CHECK-NEXT:    #APP
3378; CHECK-NEXT:    nop
3379; CHECK-NEXT:    #NO_APP
3380; CHECK-NEXT:    kmovd %edi, %k1
3381; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3382; CHECK-NEXT:    retq
3383  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3384  %2 = icmp sgt <32 x i16> %a1, %a0
3385  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3386  %4 = bitcast i32 %mask to <32 x i1>
3387  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3388  ret <32 x i16> %5
3389}
3390
3391define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) {
3392; CHECK-LABEL: stack_fold_pmaxub:
3393; CHECK:       # %bb.0:
3394; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3395; CHECK-NEXT:    #APP
3396; CHECK-NEXT:    nop
3397; CHECK-NEXT:    #NO_APP
3398; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3399; CHECK-NEXT:    retq
3400  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3401  %2 = icmp ugt <64 x i8> %a0, %a1
3402  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3403  ret <64 x i8> %3
3404}
3405
3406define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3407; CHECK-LABEL: stack_fold_pmaxub_commuted:
3408; CHECK:       # %bb.0:
3409; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3410; CHECK-NEXT:    #APP
3411; CHECK-NEXT:    nop
3412; CHECK-NEXT:    #NO_APP
3413; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3414; CHECK-NEXT:    retq
3415  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3416  %2 = icmp ugt <64 x i8> %a1, %a0
3417  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3418  ret <64 x i8> %3
3419}
3420
3421define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3422; CHECK-LABEL: stack_fold_pmaxub_mask:
3423; CHECK:       # %bb.0:
3424; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3425; CHECK-NEXT:    #APP
3426; CHECK-NEXT:    nop
3427; CHECK-NEXT:    #NO_APP
3428; CHECK-NEXT:    kmovq %rdi, %k1
3429; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3430; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3431; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3432; CHECK-NEXT:    retq
3433  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3434  %2 = icmp ugt <64 x i8> %a0, %a1
3435  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3436  %4 = bitcast i64 %mask to <64 x i1>
3437  ; load needed to keep the operation from being scheduled about the asm block
3438  %5 = load <64 x i8>, ptr %passthru
3439  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3440  ret <64 x i8> %6
3441}
3442
3443define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3444; CHECK-LABEL: stack_fold_pmaxub_mask_commuted:
3445; CHECK:       # %bb.0:
3446; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3447; CHECK-NEXT:    #APP
3448; CHECK-NEXT:    nop
3449; CHECK-NEXT:    #NO_APP
3450; CHECK-NEXT:    kmovq %rdi, %k1
3451; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3452; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3453; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3454; CHECK-NEXT:    retq
3455  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3456  %2 = icmp ugt <64 x i8> %a1, %a0
3457  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3458  %4 = bitcast i64 %mask to <64 x i1>
3459  ; load needed to keep the operation from being scheduled about the asm block
3460  %5 = load <64 x i8>, ptr %passthru
3461  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3462  ret <64 x i8> %6
3463}
3464
3465define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3466; CHECK-LABEL: stack_fold_pmaxub_maskz:
3467; CHECK:       # %bb.0:
3468; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3469; CHECK-NEXT:    #APP
3470; CHECK-NEXT:    nop
3471; CHECK-NEXT:    #NO_APP
3472; CHECK-NEXT:    kmovq %rdi, %k1
3473; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3474; CHECK-NEXT:    retq
3475  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3476  %2 = icmp ugt <64 x i8> %a0, %a1
3477  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3478  %4 = bitcast i64 %mask to <64 x i1>
3479  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3480  ret <64 x i8> %5
3481}
3482
3483define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3484; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted:
3485; CHECK:       # %bb.0:
3486; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3487; CHECK-NEXT:    #APP
3488; CHECK-NEXT:    nop
3489; CHECK-NEXT:    #NO_APP
3490; CHECK-NEXT:    kmovq %rdi, %k1
3491; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3492; CHECK-NEXT:    retq
3493  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3494  %2 = icmp ugt <64 x i8> %a1, %a0
3495  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3496  %4 = bitcast i64 %mask to <64 x i1>
3497  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3498  ret <64 x i8> %5
3499}
3500
3501define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) {
3502; CHECK-LABEL: stack_fold_pmaxud:
3503; CHECK:       # %bb.0:
3504; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3505; CHECK-NEXT:    #APP
3506; CHECK-NEXT:    nop
3507; CHECK-NEXT:    #NO_APP
3508; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3509; CHECK-NEXT:    retq
3510  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3511  %2 = icmp ugt <16 x i32> %a0, %a1
3512  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3513  ret <16 x i32> %3
3514}
3515
3516define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3517; CHECK-LABEL: stack_fold_pmaxud_commuted:
3518; CHECK:       # %bb.0:
3519; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3520; CHECK-NEXT:    #APP
3521; CHECK-NEXT:    nop
3522; CHECK-NEXT:    #NO_APP
3523; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3524; CHECK-NEXT:    retq
3525  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3526  %2 = icmp ugt <16 x i32> %a1, %a0
3527  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3528  ret <16 x i32> %3
3529}
3530
3531define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3532; CHECK-LABEL: stack_fold_pmaxud_mask:
3533; CHECK:       # %bb.0:
3534; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3535; CHECK-NEXT:    #APP
3536; CHECK-NEXT:    nop
3537; CHECK-NEXT:    #NO_APP
3538; CHECK-NEXT:    kmovd %edi, %k1
3539; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3540; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3541; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3542; CHECK-NEXT:    retq
3543  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3544  %2 = icmp ugt <16 x i32> %a0, %a1
3545  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3546  %4 = bitcast i16 %mask to <16 x i1>
3547  ; load needed to keep the operation from being scheduled about the asm block
3548  %5 = load <16 x i32>, ptr %passthru
3549  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3550  ret <16 x i32> %6
3551}
3552
3553define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3554; CHECK-LABEL: stack_fold_pmaxud_mask_commuted:
3555; CHECK:       # %bb.0:
3556; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3557; CHECK-NEXT:    #APP
3558; CHECK-NEXT:    nop
3559; CHECK-NEXT:    #NO_APP
3560; CHECK-NEXT:    kmovd %edi, %k1
3561; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3562; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3563; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3564; CHECK-NEXT:    retq
3565  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3566  %2 = icmp ugt <16 x i32> %a1, %a0
3567  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3568  %4 = bitcast i16 %mask to <16 x i1>
3569  ; load needed to keep the operation from being scheduled about the asm block
3570  %5 = load <16 x i32>, ptr %passthru
3571  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3572  ret <16 x i32> %6
3573}
3574
3575define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3576; CHECK-LABEL: stack_fold_pmaxud_maskz:
3577; CHECK:       # %bb.0:
3578; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3579; CHECK-NEXT:    #APP
3580; CHECK-NEXT:    nop
3581; CHECK-NEXT:    #NO_APP
3582; CHECK-NEXT:    kmovd %edi, %k1
3583; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3584; CHECK-NEXT:    retq
3585  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3586  %2 = icmp ugt <16 x i32> %a0, %a1
3587  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3588  %4 = bitcast i16 %mask to <16 x i1>
3589  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3590  ret <16 x i32> %5
3591}
3592
3593define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3594; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted:
3595; CHECK:       # %bb.0:
3596; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3597; CHECK-NEXT:    #APP
3598; CHECK-NEXT:    nop
3599; CHECK-NEXT:    #NO_APP
3600; CHECK-NEXT:    kmovd %edi, %k1
3601; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3602; CHECK-NEXT:    retq
3603  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3604  %2 = icmp ugt <16 x i32> %a1, %a0
3605  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3606  %4 = bitcast i16 %mask to <16 x i1>
3607  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3608  ret <16 x i32> %5
3609}
3610
3611define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) {
3612; CHECK-LABEL: stack_fold_pmaxuq:
3613; CHECK:       # %bb.0:
3614; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3615; CHECK-NEXT:    #APP
3616; CHECK-NEXT:    nop
3617; CHECK-NEXT:    #NO_APP
3618; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3619; CHECK-NEXT:    retq
3620  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3621  %2 = icmp ugt <8 x i64> %a0, %a1
3622  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3623  ret <8 x i64> %3
3624}
3625
3626define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3627; CHECK-LABEL: stack_fold_pmaxuq_commuted:
3628; CHECK:       # %bb.0:
3629; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3630; CHECK-NEXT:    #APP
3631; CHECK-NEXT:    nop
3632; CHECK-NEXT:    #NO_APP
3633; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3634; CHECK-NEXT:    retq
3635  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3636  %2 = icmp ugt <8 x i64> %a1, %a0
3637  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3638  ret <8 x i64> %3
3639}
3640
3641define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3642; CHECK-LABEL: stack_fold_pmaxuq_mask:
3643; CHECK:       # %bb.0:
3644; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3645; CHECK-NEXT:    #APP
3646; CHECK-NEXT:    nop
3647; CHECK-NEXT:    #NO_APP
3648; CHECK-NEXT:    kmovd %edi, %k1
3649; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3650; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3651; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3652; CHECK-NEXT:    retq
3653  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3654  %2 = icmp ugt <8 x i64> %a0, %a1
3655  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3656  %4 = bitcast i8 %mask to <8 x i1>
3657  ; load needed to keep the operation from being scheduled about the asm block
3658  %5 = load <8 x i64>, ptr %passthru
3659  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3660  ret <8 x i64> %6
3661}
3662
3663define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3664; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted:
3665; CHECK:       # %bb.0:
3666; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3667; CHECK-NEXT:    #APP
3668; CHECK-NEXT:    nop
3669; CHECK-NEXT:    #NO_APP
3670; CHECK-NEXT:    kmovd %edi, %k1
3671; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3672; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3673; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3674; CHECK-NEXT:    retq
3675  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3676  %2 = icmp ugt <8 x i64> %a1, %a0
3677  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3678  %4 = bitcast i8 %mask to <8 x i1>
3679  ; load needed to keep the operation from being scheduled about the asm block
3680  %5 = load <8 x i64>, ptr %passthru
3681  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3682  ret <8 x i64> %6
3683}
3684
3685define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3686; CHECK-LABEL: stack_fold_pmaxuq_maskz:
3687; CHECK:       # %bb.0:
3688; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3689; CHECK-NEXT:    #APP
3690; CHECK-NEXT:    nop
3691; CHECK-NEXT:    #NO_APP
3692; CHECK-NEXT:    kmovd %edi, %k1
3693; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3694; CHECK-NEXT:    retq
3695  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3696  %2 = icmp ugt <8 x i64> %a0, %a1
3697  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3698  %4 = bitcast i8 %mask to <8 x i1>
3699  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3700  ret <8 x i64> %5
3701}
3702
3703define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3704; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted:
3705; CHECK:       # %bb.0:
3706; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3707; CHECK-NEXT:    #APP
3708; CHECK-NEXT:    nop
3709; CHECK-NEXT:    #NO_APP
3710; CHECK-NEXT:    kmovd %edi, %k1
3711; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3712; CHECK-NEXT:    retq
3713  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3714  %2 = icmp ugt <8 x i64> %a1, %a0
3715  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3716  %4 = bitcast i8 %mask to <8 x i1>
3717  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3718  ret <8 x i64> %5
3719}
3720
3721define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) {
3722; CHECK-LABEL: stack_fold_pmaxuw:
3723; CHECK:       # %bb.0:
3724; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3725; CHECK-NEXT:    #APP
3726; CHECK-NEXT:    nop
3727; CHECK-NEXT:    #NO_APP
3728; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3729; CHECK-NEXT:    retq
3730  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3731  %2 = icmp ugt <32 x i16> %a0, %a1
3732  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3733  ret <32 x i16> %3
3734}
3735
3736define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3737; CHECK-LABEL: stack_fold_pmaxuw_commuted:
3738; CHECK:       # %bb.0:
3739; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3740; CHECK-NEXT:    #APP
3741; CHECK-NEXT:    nop
3742; CHECK-NEXT:    #NO_APP
3743; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3744; CHECK-NEXT:    retq
3745  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3746  %2 = icmp ugt <32 x i16> %a1, %a0
3747  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3748  ret <32 x i16> %3
3749}
3750
3751define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3752; CHECK-LABEL: stack_fold_pmaxuw_mask:
3753; CHECK:       # %bb.0:
3754; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3755; CHECK-NEXT:    #APP
3756; CHECK-NEXT:    nop
3757; CHECK-NEXT:    #NO_APP
3758; CHECK-NEXT:    kmovd %edi, %k1
3759; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3760; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3761; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3762; CHECK-NEXT:    retq
3763  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3764  %2 = icmp ugt <32 x i16> %a0, %a1
3765  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3766  %4 = bitcast i32 %mask to <32 x i1>
3767  ; load needed to keep the operation from being scheduled about the asm block
3768  %5 = load <32 x i16>, ptr %passthru
3769  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3770  ret <32 x i16> %6
3771}
3772
3773define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3774; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted:
3775; CHECK:       # %bb.0:
3776; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3777; CHECK-NEXT:    #APP
3778; CHECK-NEXT:    nop
3779; CHECK-NEXT:    #NO_APP
3780; CHECK-NEXT:    kmovd %edi, %k1
3781; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3782; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3783; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3784; CHECK-NEXT:    retq
3785  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3786  %2 = icmp ugt <32 x i16> %a1, %a0
3787  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3788  %4 = bitcast i32 %mask to <32 x i1>
3789  ; load needed to keep the operation from being scheduled about the asm block
3790  %5 = load <32 x i16>, ptr %passthru
3791  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3792  ret <32 x i16> %6
3793}
3794
3795define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3796; CHECK-LABEL: stack_fold_pmaxuw_maskz:
3797; CHECK:       # %bb.0:
3798; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3799; CHECK-NEXT:    #APP
3800; CHECK-NEXT:    nop
3801; CHECK-NEXT:    #NO_APP
3802; CHECK-NEXT:    kmovd %edi, %k1
3803; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3804; CHECK-NEXT:    retq
3805  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3806  %2 = icmp ugt <32 x i16> %a0, %a1
3807  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3808  %4 = bitcast i32 %mask to <32 x i1>
3809  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3810  ret <32 x i16> %5
3811}
3812
3813define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3814; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted:
3815; CHECK:       # %bb.0:
3816; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3817; CHECK-NEXT:    #APP
3818; CHECK-NEXT:    nop
3819; CHECK-NEXT:    #NO_APP
3820; CHECK-NEXT:    kmovd %edi, %k1
3821; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3822; CHECK-NEXT:    retq
3823  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3824  %2 = icmp ugt <32 x i16> %a1, %a0
3825  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3826  %4 = bitcast i32 %mask to <32 x i1>
3827  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3828  ret <32 x i16> %5
3829}
3830
3831define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) {
3832; CHECK-LABEL: stack_fold_pminsb:
3833; CHECK:       # %bb.0:
3834; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3835; CHECK-NEXT:    #APP
3836; CHECK-NEXT:    nop
3837; CHECK-NEXT:    #NO_APP
3838; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3839; CHECK-NEXT:    retq
3840  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3841  %2 = icmp slt <64 x i8> %a0, %a1
3842  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3843  ret <64 x i8> %3
3844}
3845
3846define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3847; CHECK-LABEL: stack_fold_pminsb_commuted:
3848; CHECK:       # %bb.0:
3849; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3850; CHECK-NEXT:    #APP
3851; CHECK-NEXT:    nop
3852; CHECK-NEXT:    #NO_APP
3853; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3854; CHECK-NEXT:    retq
3855  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3856  %2 = icmp slt <64 x i8> %a1, %a0
3857  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3858  ret <64 x i8> %3
3859}
3860
3861define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3862; CHECK-LABEL: stack_fold_pminsb_mask:
3863; CHECK:       # %bb.0:
3864; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3865; CHECK-NEXT:    #APP
3866; CHECK-NEXT:    nop
3867; CHECK-NEXT:    #NO_APP
3868; CHECK-NEXT:    kmovq %rdi, %k1
3869; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3870; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3871; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3872; CHECK-NEXT:    retq
3873  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3874  %2 = icmp slt <64 x i8> %a0, %a1
3875  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3876  %4 = bitcast i64 %mask to <64 x i1>
3877  ; load needed to keep the operation from being scheduled about the asm block
3878  %5 = load <64 x i8>, ptr %passthru
3879  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3880  ret <64 x i8> %6
3881}
3882
3883define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3884; CHECK-LABEL: stack_fold_pminsb_mask_commuted:
3885; CHECK:       # %bb.0:
3886; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3887; CHECK-NEXT:    #APP
3888; CHECK-NEXT:    nop
3889; CHECK-NEXT:    #NO_APP
3890; CHECK-NEXT:    kmovq %rdi, %k1
3891; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3892; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3893; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3894; CHECK-NEXT:    retq
3895  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3896  %2 = icmp slt <64 x i8> %a1, %a0
3897  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3898  %4 = bitcast i64 %mask to <64 x i1>
3899  ; load needed to keep the operation from being scheduled about the asm block
3900  %5 = load <64 x i8>, ptr %passthru
3901  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3902  ret <64 x i8> %6
3903}
3904
3905define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3906; CHECK-LABEL: stack_fold_pminsb_maskz:
3907; CHECK:       # %bb.0:
3908; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3909; CHECK-NEXT:    #APP
3910; CHECK-NEXT:    nop
3911; CHECK-NEXT:    #NO_APP
3912; CHECK-NEXT:    kmovq %rdi, %k1
3913; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3914; CHECK-NEXT:    retq
3915  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3916  %2 = icmp slt <64 x i8> %a0, %a1
3917  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3918  %4 = bitcast i64 %mask to <64 x i1>
3919  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3920  ret <64 x i8> %5
3921}
3922
3923define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3924; CHECK-LABEL: stack_fold_pminsb_maskz_commuted:
3925; CHECK:       # %bb.0:
3926; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3927; CHECK-NEXT:    #APP
3928; CHECK-NEXT:    nop
3929; CHECK-NEXT:    #NO_APP
3930; CHECK-NEXT:    kmovq %rdi, %k1
3931; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3932; CHECK-NEXT:    retq
3933  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3934  %2 = icmp slt <64 x i8> %a1, %a0
3935  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3936  %4 = bitcast i64 %mask to <64 x i1>
3937  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3938  ret <64 x i8> %5
3939}
3940
3941define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) {
3942; CHECK-LABEL: stack_fold_pminsd:
3943; CHECK:       # %bb.0:
3944; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3945; CHECK-NEXT:    #APP
3946; CHECK-NEXT:    nop
3947; CHECK-NEXT:    #NO_APP
3948; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3949; CHECK-NEXT:    retq
3950  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3951  %2 = icmp slt <16 x i32> %a0, %a1
3952  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3953  ret <16 x i32> %3
3954}
3955
3956define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3957; CHECK-LABEL: stack_fold_pminsd_commuted:
3958; CHECK:       # %bb.0:
3959; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3960; CHECK-NEXT:    #APP
3961; CHECK-NEXT:    nop
3962; CHECK-NEXT:    #NO_APP
3963; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3964; CHECK-NEXT:    retq
3965  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3966  %2 = icmp slt <16 x i32> %a1, %a0
3967  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3968  ret <16 x i32> %3
3969}
3970
3971define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3972; CHECK-LABEL: stack_fold_pminsd_mask:
3973; CHECK:       # %bb.0:
3974; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3975; CHECK-NEXT:    #APP
3976; CHECK-NEXT:    nop
3977; CHECK-NEXT:    #NO_APP
3978; CHECK-NEXT:    kmovd %edi, %k1
3979; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3980; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3981; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3982; CHECK-NEXT:    retq
3983  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3984  %2 = icmp slt <16 x i32> %a0, %a1
3985  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3986  %4 = bitcast i16 %mask to <16 x i1>
3987  ; load needed to keep the operation from being scheduled about the asm block
3988  %5 = load <16 x i32>, ptr %passthru
3989  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3990  ret <16 x i32> %6
3991}
3992
3993define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3994; CHECK-LABEL: stack_fold_pminsd_mask_commuted:
3995; CHECK:       # %bb.0:
3996; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3997; CHECK-NEXT:    #APP
3998; CHECK-NEXT:    nop
3999; CHECK-NEXT:    #NO_APP
4000; CHECK-NEXT:    kmovd %edi, %k1
4001; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4002; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4003; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4004; CHECK-NEXT:    retq
4005  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4006  %2 = icmp slt <16 x i32> %a1, %a0
4007  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4008  %4 = bitcast i16 %mask to <16 x i1>
4009  ; load needed to keep the operation from being scheduled about the asm block
4010  %5 = load <16 x i32>, ptr %passthru
4011  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4012  ret <16 x i32> %6
4013}
4014
4015define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4016; CHECK-LABEL: stack_fold_pminsd_maskz:
4017; CHECK:       # %bb.0:
4018; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4019; CHECK-NEXT:    #APP
4020; CHECK-NEXT:    nop
4021; CHECK-NEXT:    #NO_APP
4022; CHECK-NEXT:    kmovd %edi, %k1
4023; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4024; CHECK-NEXT:    retq
4025  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4026  %2 = icmp slt <16 x i32> %a0, %a1
4027  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4028  %4 = bitcast i16 %mask to <16 x i1>
4029  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4030  ret <16 x i32> %5
4031}
4032
4033define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4034; CHECK-LABEL: stack_fold_pminsd_maskz_commuted:
4035; CHECK:       # %bb.0:
4036; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4037; CHECK-NEXT:    #APP
4038; CHECK-NEXT:    nop
4039; CHECK-NEXT:    #NO_APP
4040; CHECK-NEXT:    kmovd %edi, %k1
4041; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4042; CHECK-NEXT:    retq
4043  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4044  %2 = icmp slt <16 x i32> %a1, %a0
4045  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4046  %4 = bitcast i16 %mask to <16 x i1>
4047  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4048  ret <16 x i32> %5
4049}
4050
4051define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) {
4052; CHECK-LABEL: stack_fold_pminsq:
4053; CHECK:       # %bb.0:
4054; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4055; CHECK-NEXT:    #APP
4056; CHECK-NEXT:    nop
4057; CHECK-NEXT:    #NO_APP
4058; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4059; CHECK-NEXT:    retq
4060  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4061  %2 = icmp slt <8 x i64> %a0, %a1
4062  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4063  ret <8 x i64> %3
4064}
4065
4066define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4067; CHECK-LABEL: stack_fold_pminsq_commuted:
4068; CHECK:       # %bb.0:
4069; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4070; CHECK-NEXT:    #APP
4071; CHECK-NEXT:    nop
4072; CHECK-NEXT:    #NO_APP
4073; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4074; CHECK-NEXT:    retq
4075  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4076  %2 = icmp slt <8 x i64> %a1, %a0
4077  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4078  ret <8 x i64> %3
4079}
4080
4081define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4082; CHECK-LABEL: stack_fold_pminsq_mask:
4083; CHECK:       # %bb.0:
4084; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4085; CHECK-NEXT:    #APP
4086; CHECK-NEXT:    nop
4087; CHECK-NEXT:    #NO_APP
4088; CHECK-NEXT:    kmovd %edi, %k1
4089; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4090; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4091; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4092; CHECK-NEXT:    retq
4093  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4094  %2 = icmp slt <8 x i64> %a0, %a1
4095  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4096  %4 = bitcast i8 %mask to <8 x i1>
4097  ; load needed to keep the operation from being scheduled about the asm block
4098  %5 = load <8 x i64>, ptr %passthru
4099  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4100  ret <8 x i64> %6
4101}
4102
4103define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4104; CHECK-LABEL: stack_fold_pminsq_mask_commuted:
4105; CHECK:       # %bb.0:
4106; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4107; CHECK-NEXT:    #APP
4108; CHECK-NEXT:    nop
4109; CHECK-NEXT:    #NO_APP
4110; CHECK-NEXT:    kmovd %edi, %k1
4111; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4112; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4113; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4114; CHECK-NEXT:    retq
4115  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4116  %2 = icmp slt <8 x i64> %a1, %a0
4117  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4118  %4 = bitcast i8 %mask to <8 x i1>
4119  ; load needed to keep the operation from being scheduled about the asm block
4120  %5 = load <8 x i64>, ptr %passthru
4121  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4122  ret <8 x i64> %6
4123}
4124
4125define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4126; CHECK-LABEL: stack_fold_pminsq_maskz:
4127; CHECK:       # %bb.0:
4128; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4129; CHECK-NEXT:    #APP
4130; CHECK-NEXT:    nop
4131; CHECK-NEXT:    #NO_APP
4132; CHECK-NEXT:    kmovd %edi, %k1
4133; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4134; CHECK-NEXT:    retq
4135  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4136  %2 = icmp slt <8 x i64> %a0, %a1
4137  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4138  %4 = bitcast i8 %mask to <8 x i1>
4139  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4140  ret <8 x i64> %5
4141}
4142
4143define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4144; CHECK-LABEL: stack_fold_pminsq_maskz_commuted:
4145; CHECK:       # %bb.0:
4146; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4147; CHECK-NEXT:    #APP
4148; CHECK-NEXT:    nop
4149; CHECK-NEXT:    #NO_APP
4150; CHECK-NEXT:    kmovd %edi, %k1
4151; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4152; CHECK-NEXT:    retq
4153  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4154  %2 = icmp slt <8 x i64> %a1, %a0
4155  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4156  %4 = bitcast i8 %mask to <8 x i1>
4157  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4158  ret <8 x i64> %5
4159}
4160
4161define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) {
4162; CHECK-LABEL: stack_fold_pminsw:
4163; CHECK:       # %bb.0:
4164; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4165; CHECK-NEXT:    #APP
4166; CHECK-NEXT:    nop
4167; CHECK-NEXT:    #NO_APP
4168; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4169; CHECK-NEXT:    retq
4170  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4171  %2 = icmp slt <32 x i16> %a0, %a1
4172  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4173  ret <32 x i16> %3
4174}
4175
4176define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4177; CHECK-LABEL: stack_fold_pminsw_commuted:
4178; CHECK:       # %bb.0:
4179; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4180; CHECK-NEXT:    #APP
4181; CHECK-NEXT:    nop
4182; CHECK-NEXT:    #NO_APP
4183; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4184; CHECK-NEXT:    retq
4185  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4186  %2 = icmp slt <32 x i16> %a1, %a0
4187  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4188  ret <32 x i16> %3
4189}
4190
4191define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4192; CHECK-LABEL: stack_fold_pminsw_mask:
4193; CHECK:       # %bb.0:
4194; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4195; CHECK-NEXT:    #APP
4196; CHECK-NEXT:    nop
4197; CHECK-NEXT:    #NO_APP
4198; CHECK-NEXT:    kmovd %edi, %k1
4199; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4200; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4201; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4202; CHECK-NEXT:    retq
4203  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4204  %2 = icmp slt <32 x i16> %a0, %a1
4205  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4206  %4 = bitcast i32 %mask to <32 x i1>
4207  ; load needed to keep the operation from being scheduled about the asm block
4208  %5 = load <32 x i16>, ptr %passthru
4209  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4210  ret <32 x i16> %6
4211}
4212
4213define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4214; CHECK-LABEL: stack_fold_pminsw_mask_commuted:
4215; CHECK:       # %bb.0:
4216; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4217; CHECK-NEXT:    #APP
4218; CHECK-NEXT:    nop
4219; CHECK-NEXT:    #NO_APP
4220; CHECK-NEXT:    kmovd %edi, %k1
4221; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4222; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4223; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4224; CHECK-NEXT:    retq
4225  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4226  %2 = icmp slt <32 x i16> %a1, %a0
4227  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4228  %4 = bitcast i32 %mask to <32 x i1>
4229  ; load needed to keep the operation from being scheduled about the asm block
4230  %5 = load <32 x i16>, ptr %passthru
4231  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4232  ret <32 x i16> %6
4233}
4234
4235define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4236; CHECK-LABEL: stack_fold_pminsw_maskz:
4237; CHECK:       # %bb.0:
4238; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4239; CHECK-NEXT:    #APP
4240; CHECK-NEXT:    nop
4241; CHECK-NEXT:    #NO_APP
4242; CHECK-NEXT:    kmovd %edi, %k1
4243; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4244; CHECK-NEXT:    retq
4245  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4246  %2 = icmp slt <32 x i16> %a0, %a1
4247  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4248  %4 = bitcast i32 %mask to <32 x i1>
4249  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4250  ret <32 x i16> %5
4251}
4252
4253define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4254; CHECK-LABEL: stack_fold_pminsw_maskz_commuted:
4255; CHECK:       # %bb.0:
4256; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4257; CHECK-NEXT:    #APP
4258; CHECK-NEXT:    nop
4259; CHECK-NEXT:    #NO_APP
4260; CHECK-NEXT:    kmovd %edi, %k1
4261; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4262; CHECK-NEXT:    retq
4263  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4264  %2 = icmp slt <32 x i16> %a1, %a0
4265  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4266  %4 = bitcast i32 %mask to <32 x i1>
4267  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4268  ret <32 x i16> %5
4269}
4270
4271define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) {
4272; CHECK-LABEL: stack_fold_pminub:
4273; CHECK:       # %bb.0:
4274; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4275; CHECK-NEXT:    #APP
4276; CHECK-NEXT:    nop
4277; CHECK-NEXT:    #NO_APP
4278; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4279; CHECK-NEXT:    retq
4280  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4281  %2 = icmp ult <64 x i8> %a0, %a1
4282  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4283  ret <64 x i8> %3
4284}
4285
4286define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
4287; CHECK-LABEL: stack_fold_pminub_commuted:
4288; CHECK:       # %bb.0:
4289; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4290; CHECK-NEXT:    #APP
4291; CHECK-NEXT:    nop
4292; CHECK-NEXT:    #NO_APP
4293; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4294; CHECK-NEXT:    retq
4295  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4296  %2 = icmp ult <64 x i8> %a1, %a0
4297  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4298  ret <64 x i8> %3
4299}
4300
4301define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
4302; CHECK-LABEL: stack_fold_pminub_mask:
4303; CHECK:       # %bb.0:
4304; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4305; CHECK-NEXT:    #APP
4306; CHECK-NEXT:    nop
4307; CHECK-NEXT:    #NO_APP
4308; CHECK-NEXT:    kmovq %rdi, %k1
4309; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4310; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4311; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4312; CHECK-NEXT:    retq
4313  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4314  %2 = icmp ult <64 x i8> %a0, %a1
4315  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4316  %4 = bitcast i64 %mask to <64 x i1>
4317  ; load needed to keep the operation from being scheduled about the asm block
4318  %5 = load <64 x i8>, ptr %passthru
4319  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4320  ret <64 x i8> %6
4321}
4322
4323define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
4324; CHECK-LABEL: stack_fold_pminub_mask_commuted:
4325; CHECK:       # %bb.0:
4326; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4327; CHECK-NEXT:    #APP
4328; CHECK-NEXT:    nop
4329; CHECK-NEXT:    #NO_APP
4330; CHECK-NEXT:    kmovq %rdi, %k1
4331; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4332; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4333; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4334; CHECK-NEXT:    retq
4335  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4336  %2 = icmp ult <64 x i8> %a1, %a0
4337  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4338  %4 = bitcast i64 %mask to <64 x i1>
4339  ; load needed to keep the operation from being scheduled about the asm block
4340  %5 = load <64 x i8>, ptr %passthru
4341  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4342  ret <64 x i8> %6
4343}
4344
4345define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4346; CHECK-LABEL: stack_fold_pminub_maskz:
4347; CHECK:       # %bb.0:
4348; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4349; CHECK-NEXT:    #APP
4350; CHECK-NEXT:    nop
4351; CHECK-NEXT:    #NO_APP
4352; CHECK-NEXT:    kmovq %rdi, %k1
4353; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4354; CHECK-NEXT:    retq
4355  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4356  %2 = icmp ult <64 x i8> %a0, %a1
4357  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4358  %4 = bitcast i64 %mask to <64 x i1>
4359  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4360  ret <64 x i8> %5
4361}
4362
4363define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4364; CHECK-LABEL: stack_fold_pminub_maskz_commuted:
4365; CHECK:       # %bb.0:
4366; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4367; CHECK-NEXT:    #APP
4368; CHECK-NEXT:    nop
4369; CHECK-NEXT:    #NO_APP
4370; CHECK-NEXT:    kmovq %rdi, %k1
4371; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4372; CHECK-NEXT:    retq
4373  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4374  %2 = icmp ult <64 x i8> %a1, %a0
4375  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4376  %4 = bitcast i64 %mask to <64 x i1>
4377  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4378  ret <64 x i8> %5
4379}
4380
4381define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) {
4382; CHECK-LABEL: stack_fold_pminud:
4383; CHECK:       # %bb.0:
4384; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4385; CHECK-NEXT:    #APP
4386; CHECK-NEXT:    nop
4387; CHECK-NEXT:    #NO_APP
4388; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4389; CHECK-NEXT:    retq
4390  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4391  %2 = icmp ult <16 x i32> %a0, %a1
4392  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4393  ret <16 x i32> %3
4394}
4395
4396define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
4397; CHECK-LABEL: stack_fold_pminud_commuted:
4398; CHECK:       # %bb.0:
4399; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4400; CHECK-NEXT:    #APP
4401; CHECK-NEXT:    nop
4402; CHECK-NEXT:    #NO_APP
4403; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4404; CHECK-NEXT:    retq
4405  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4406  %2 = icmp ult <16 x i32> %a1, %a0
4407  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4408  ret <16 x i32> %3
4409}
4410
4411define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
4412; CHECK-LABEL: stack_fold_pminud_mask:
4413; CHECK:       # %bb.0:
4414; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4415; CHECK-NEXT:    #APP
4416; CHECK-NEXT:    nop
4417; CHECK-NEXT:    #NO_APP
4418; CHECK-NEXT:    kmovd %edi, %k1
4419; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4420; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4421; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4422; CHECK-NEXT:    retq
4423  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4424  %2 = icmp ult <16 x i32> %a0, %a1
4425  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4426  %4 = bitcast i16 %mask to <16 x i1>
4427  ; load needed to keep the operation from being scheduled about the asm block
4428  %5 = load <16 x i32>, ptr %passthru
4429  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4430  ret <16 x i32> %6
4431}
4432
4433define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
4434; CHECK-LABEL: stack_fold_pminud_mask_commuted:
4435; CHECK:       # %bb.0:
4436; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4437; CHECK-NEXT:    #APP
4438; CHECK-NEXT:    nop
4439; CHECK-NEXT:    #NO_APP
4440; CHECK-NEXT:    kmovd %edi, %k1
4441; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4442; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4443; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4444; CHECK-NEXT:    retq
4445  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4446  %2 = icmp ult <16 x i32> %a1, %a0
4447  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4448  %4 = bitcast i16 %mask to <16 x i1>
4449  ; load needed to keep the operation from being scheduled about the asm block
4450  %5 = load <16 x i32>, ptr %passthru
4451  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4452  ret <16 x i32> %6
4453}
4454
4455define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4456; CHECK-LABEL: stack_fold_pminud_maskz:
4457; CHECK:       # %bb.0:
4458; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4459; CHECK-NEXT:    #APP
4460; CHECK-NEXT:    nop
4461; CHECK-NEXT:    #NO_APP
4462; CHECK-NEXT:    kmovd %edi, %k1
4463; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4464; CHECK-NEXT:    retq
4465  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4466  %2 = icmp ult <16 x i32> %a0, %a1
4467  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4468  %4 = bitcast i16 %mask to <16 x i1>
4469  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4470  ret <16 x i32> %5
4471}
4472
4473define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4474; CHECK-LABEL: stack_fold_pminud_maskz_commuted:
4475; CHECK:       # %bb.0:
4476; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4477; CHECK-NEXT:    #APP
4478; CHECK-NEXT:    nop
4479; CHECK-NEXT:    #NO_APP
4480; CHECK-NEXT:    kmovd %edi, %k1
4481; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4482; CHECK-NEXT:    retq
4483  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4484  %2 = icmp ult <16 x i32> %a1, %a0
4485  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4486  %4 = bitcast i16 %mask to <16 x i1>
4487  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4488  ret <16 x i32> %5
4489}
4490
4491define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) {
4492; CHECK-LABEL: stack_fold_pminuq:
4493; CHECK:       # %bb.0:
4494; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4495; CHECK-NEXT:    #APP
4496; CHECK-NEXT:    nop
4497; CHECK-NEXT:    #NO_APP
4498; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4499; CHECK-NEXT:    retq
4500  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4501  %2 = icmp ult <8 x i64> %a0, %a1
4502  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4503  ret <8 x i64> %3
4504}
4505
4506define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4507; CHECK-LABEL: stack_fold_pminuq_commuted:
4508; CHECK:       # %bb.0:
4509; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4510; CHECK-NEXT:    #APP
4511; CHECK-NEXT:    nop
4512; CHECK-NEXT:    #NO_APP
4513; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4514; CHECK-NEXT:    retq
4515  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4516  %2 = icmp ult <8 x i64> %a1, %a0
4517  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4518  ret <8 x i64> %3
4519}
4520
4521define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4522; CHECK-LABEL: stack_fold_pminuq_mask:
4523; CHECK:       # %bb.0:
4524; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4525; CHECK-NEXT:    #APP
4526; CHECK-NEXT:    nop
4527; CHECK-NEXT:    #NO_APP
4528; CHECK-NEXT:    kmovd %edi, %k1
4529; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4530; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4531; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4532; CHECK-NEXT:    retq
4533  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4534  %2 = icmp ult <8 x i64> %a0, %a1
4535  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4536  %4 = bitcast i8 %mask to <8 x i1>
4537  ; load needed to keep the operation from being scheduled about the asm block
4538  %5 = load <8 x i64>, ptr %passthru
4539  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4540  ret <8 x i64> %6
4541}
4542
4543define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4544; CHECK-LABEL: stack_fold_pminuq_mask_commuted:
4545; CHECK:       # %bb.0:
4546; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4547; CHECK-NEXT:    #APP
4548; CHECK-NEXT:    nop
4549; CHECK-NEXT:    #NO_APP
4550; CHECK-NEXT:    kmovd %edi, %k1
4551; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4552; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4553; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4554; CHECK-NEXT:    retq
4555  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4556  %2 = icmp ult <8 x i64> %a1, %a0
4557  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4558  %4 = bitcast i8 %mask to <8 x i1>
4559  ; load needed to keep the operation from being scheduled about the asm block
4560  %5 = load <8 x i64>, ptr %passthru
4561  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4562  ret <8 x i64> %6
4563}
4564
4565define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4566; CHECK-LABEL: stack_fold_pminuq_maskz:
4567; CHECK:       # %bb.0:
4568; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4569; CHECK-NEXT:    #APP
4570; CHECK-NEXT:    nop
4571; CHECK-NEXT:    #NO_APP
4572; CHECK-NEXT:    kmovd %edi, %k1
4573; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4574; CHECK-NEXT:    retq
4575  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4576  %2 = icmp ult <8 x i64> %a0, %a1
4577  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4578  %4 = bitcast i8 %mask to <8 x i1>
4579  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4580  ret <8 x i64> %5
4581}
4582
4583define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4584; CHECK-LABEL: stack_fold_pminuq_maskz_commuted:
4585; CHECK:       # %bb.0:
4586; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4587; CHECK-NEXT:    #APP
4588; CHECK-NEXT:    nop
4589; CHECK-NEXT:    #NO_APP
4590; CHECK-NEXT:    kmovd %edi, %k1
4591; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4592; CHECK-NEXT:    retq
4593  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4594  %2 = icmp ult <8 x i64> %a1, %a0
4595  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4596  %4 = bitcast i8 %mask to <8 x i1>
4597  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4598  ret <8 x i64> %5
4599}
4600
4601define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) {
4602; CHECK-LABEL: stack_fold_pminuw:
4603; CHECK:       # %bb.0:
4604; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4605; CHECK-NEXT:    #APP
4606; CHECK-NEXT:    nop
4607; CHECK-NEXT:    #NO_APP
4608; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4609; CHECK-NEXT:    retq
4610  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4611  %2 = icmp ult <32 x i16> %a0, %a1
4612  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4613  ret <32 x i16> %3
4614}
4615
4616define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4617; CHECK-LABEL: stack_fold_pminuw_commuted:
4618; CHECK:       # %bb.0:
4619; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4620; CHECK-NEXT:    #APP
4621; CHECK-NEXT:    nop
4622; CHECK-NEXT:    #NO_APP
4623; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4624; CHECK-NEXT:    retq
4625  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4626  %2 = icmp ult <32 x i16> %a1, %a0
4627  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4628  ret <32 x i16> %3
4629}
4630
4631define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4632; CHECK-LABEL: stack_fold_pminuw_mask:
4633; CHECK:       # %bb.0:
4634; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4635; CHECK-NEXT:    #APP
4636; CHECK-NEXT:    nop
4637; CHECK-NEXT:    #NO_APP
4638; CHECK-NEXT:    kmovd %edi, %k1
4639; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4640; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4641; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4642; CHECK-NEXT:    retq
4643  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4644  %2 = icmp ult <32 x i16> %a0, %a1
4645  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4646  %4 = bitcast i32 %mask to <32 x i1>
4647  ; load needed to keep the operation from being scheduled about the asm block
4648  %5 = load <32 x i16>, ptr %passthru
4649  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4650  ret <32 x i16> %6
4651}
4652
4653define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4654; CHECK-LABEL: stack_fold_pminuw_mask_commuted:
4655; CHECK:       # %bb.0:
4656; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4657; CHECK-NEXT:    #APP
4658; CHECK-NEXT:    nop
4659; CHECK-NEXT:    #NO_APP
4660; CHECK-NEXT:    kmovd %edi, %k1
4661; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4662; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4663; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4664; CHECK-NEXT:    retq
4665  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4666  %2 = icmp ult <32 x i16> %a1, %a0
4667  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4668  %4 = bitcast i32 %mask to <32 x i1>
4669  ; load needed to keep the operation from being scheduled about the asm block
4670  %5 = load <32 x i16>, ptr %passthru
4671  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4672  ret <32 x i16> %6
4673}
4674
4675define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4676; CHECK-LABEL: stack_fold_pminuw_maskz:
4677; CHECK:       # %bb.0:
4678; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4679; CHECK-NEXT:    #APP
4680; CHECK-NEXT:    nop
4681; CHECK-NEXT:    #NO_APP
4682; CHECK-NEXT:    kmovd %edi, %k1
4683; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4684; CHECK-NEXT:    retq
4685  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4686  %2 = icmp ult <32 x i16> %a0, %a1
4687  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4688  %4 = bitcast i32 %mask to <32 x i1>
4689  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4690  ret <32 x i16> %5
4691}
4692
4693define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4694; CHECK-LABEL: stack_fold_pminuw_maskz_commuted:
4695; CHECK:       # %bb.0:
4696; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4697; CHECK-NEXT:    #APP
4698; CHECK-NEXT:    nop
4699; CHECK-NEXT:    #NO_APP
4700; CHECK-NEXT:    kmovd %edi, %k1
4701; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4702; CHECK-NEXT:    retq
4703  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4704  %2 = icmp ult <32 x i16> %a1, %a0
4705  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4706  %4 = bitcast i32 %mask to <32 x i1>
4707  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4708  ret <32 x i16> %5
4709}
4710
4711define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) {
4712; CHECK-LABEL: stack_fold_vpmovdb:
4713; CHECK:       # %bb.0:
4714; CHECK-NEXT:    vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4715; CHECK-NEXT:    #APP
4716; CHECK-NEXT:    nop
4717; CHECK-NEXT:    #NO_APP
4718; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4719; CHECK-NEXT:    vzeroupper
4720; CHECK-NEXT:    retq
4721  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4722  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4723  ret <16 x i8> %1
4724}
4725declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
4726
4727define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) {
4728; CHECK-LABEL: stack_fold_vpmovdw:
4729; CHECK:       # %bb.0:
4730; CHECK-NEXT:    vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4731; CHECK-NEXT:    #APP
4732; CHECK-NEXT:    nop
4733; CHECK-NEXT:    #NO_APP
4734; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4735; CHECK-NEXT:    retq
4736  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4737  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4738  ret <16 x i16> %1
4739}
4740declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
4741
4742define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
4743; CHECK-LABEL: stack_fold_movq_load:
4744; CHECK:       # %bb.0:
4745; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4746; CHECK-NEXT:    #APP
4747; CHECK-NEXT:    nop
4748; CHECK-NEXT:    #NO_APP
4749; CHECK-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4750; CHECK-NEXT:    # xmm0 = mem[0],zero
4751; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
4752; CHECK-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
4753; CHECK-NEXT:    retq
4754  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4755  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
4756  ; add forces execution domain
4757  %3 = add <2 x i64> %2, <i64 1, i64 1>
4758  ret <2 x i64> %3
4759}
4760
4761define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
4762; CHECK-LABEL: stack_fold_vpmovqd:
4763; CHECK:       # %bb.0:
4764; CHECK-NEXT:    vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4765; CHECK-NEXT:    #APP
4766; CHECK-NEXT:    nop
4767; CHECK-NEXT:    #NO_APP
4768; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4769; CHECK-NEXT:    retq
4770  %1 = trunc <8 x i64> %a0 to <8 x i32>
4771  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4772  ret <8 x i32> %1
4773}
4774declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
4775
4776define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) {
4777; CHECK-LABEL: stack_fold_vpmovqw:
4778; CHECK:       # %bb.0:
4779; CHECK-NEXT:    vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4780; CHECK-NEXT:    #APP
4781; CHECK-NEXT:    nop
4782; CHECK-NEXT:    #NO_APP
4783; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4784; CHECK-NEXT:    vzeroupper
4785; CHECK-NEXT:    retq
4786  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4787  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4788  ret <8 x i16> %1
4789}
4790declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
4791
4792define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
4793; CHECK-LABEL: stack_fold_vpmovwb:
4794; CHECK:       # %bb.0:
4795; CHECK-NEXT:    vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4796; CHECK-NEXT:    #APP
4797; CHECK-NEXT:    nop
4798; CHECK-NEXT:    #NO_APP
4799; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4800; CHECK-NEXT:    retq
4801  %1 = trunc <32 x i16> %a0 to <32 x i8>
4802  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4803  ret <32 x i8> %1
4804}
4805declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
4806
4807define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) {
4808; CHECK-LABEL: stack_fold_vpmovsdb:
4809; CHECK:       # %bb.0:
4810; CHECK-NEXT:    vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4811; CHECK-NEXT:    #APP
4812; CHECK-NEXT:    nop
4813; CHECK-NEXT:    #NO_APP
4814; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4815; CHECK-NEXT:    vzeroupper
4816; CHECK-NEXT:    retq
4817  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4818  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4819  ret <16 x i8> %1
4820}
4821declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
4822
4823define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) {
4824; CHECK-LABEL: stack_fold_vpmovsdw:
4825; CHECK:       # %bb.0:
4826; CHECK-NEXT:    vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4827; CHECK-NEXT:    #APP
4828; CHECK-NEXT:    nop
4829; CHECK-NEXT:    #NO_APP
4830; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4831; CHECK-NEXT:    retq
4832  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4833  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4834  ret <16 x i16> %1
4835}
4836declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
4837
4838define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) {
4839; CHECK-LABEL: stack_fold_vpmovsqd:
4840; CHECK:       # %bb.0:
4841; CHECK-NEXT:    vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4842; CHECK-NEXT:    #APP
4843; CHECK-NEXT:    nop
4844; CHECK-NEXT:    #NO_APP
4845; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4846; CHECK-NEXT:    retq
4847  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
4848  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4849  ret <8 x i32> %1
4850}
4851declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
4852
4853define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) {
4854; CHECK-LABEL: stack_fold_vpmovsqw:
4855; CHECK:       # %bb.0:
4856; CHECK-NEXT:    vpmovsqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4857; CHECK-NEXT:    #APP
4858; CHECK-NEXT:    nop
4859; CHECK-NEXT:    #NO_APP
4860; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4861; CHECK-NEXT:    vzeroupper
4862; CHECK-NEXT:    retq
4863  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4864  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4865  ret <8 x i16> %1
4866}
4867declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
4868
4869define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) {
4870; CHECK-LABEL: stack_fold_vpmovswb:
4871; CHECK:       # %bb.0:
4872; CHECK-NEXT:    vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4873; CHECK-NEXT:    #APP
4874; CHECK-NEXT:    nop
4875; CHECK-NEXT:    #NO_APP
4876; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4877; CHECK-NEXT:    retq
4878  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
4879  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4880  ret <32 x i8> %1
4881}
4882declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
4883
4884define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) {
4885; CHECK-LABEL: stack_fold_pmovsxbd_zmm:
4886; CHECK:       # %bb.0:
4887; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4888; CHECK-NEXT:    #APP
4889; CHECK-NEXT:    nop
4890; CHECK-NEXT:    #NO_APP
4891; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4892; CHECK-NEXT:    retq
4893  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4894  %2 = sext <16 x i8> %a0 to <16 x i32>
4895  ret <16 x i32> %2
4896}
4897
4898define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) {
4899; CHECK-LABEL: stack_fold_pmovsxbq_zmm:
4900; CHECK:       # %bb.0:
4901; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4902; CHECK-NEXT:    #APP
4903; CHECK-NEXT:    nop
4904; CHECK-NEXT:    #NO_APP
4905; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4906; CHECK-NEXT:    retq
4907  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4908  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4909  %3 = sext <8 x i8> %2 to <8 x i64>
4910  ret <8 x i64> %3
4911}
4912
4913define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) {
4914; CHECK-LABEL: stack_fold_pmovsxbw_zmm:
4915; CHECK:       # %bb.0:
4916; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4917; CHECK-NEXT:    #APP
4918; CHECK-NEXT:    nop
4919; CHECK-NEXT:    #NO_APP
4920; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4921; CHECK-NEXT:    retq
4922  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4923  %2 = sext <32 x i8> %a0 to <32 x i16>
4924  ret <32 x i16> %2
4925}
4926
4927define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) {
4928; CHECK-LABEL: stack_fold_pmovsxdq_zmm:
4929; CHECK:       # %bb.0:
4930; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4931; CHECK-NEXT:    #APP
4932; CHECK-NEXT:    nop
4933; CHECK-NEXT:    #NO_APP
4934; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4935; CHECK-NEXT:    retq
4936  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4937  %2 = sext <8 x i32> %a0 to <8 x i64>
4938  ret <8 x i64> %2
4939}
4940
4941define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) {
4942; CHECK-LABEL: stack_fold_pmovsxwd_zmm:
4943; CHECK:       # %bb.0:
4944; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4945; CHECK-NEXT:    #APP
4946; CHECK-NEXT:    nop
4947; CHECK-NEXT:    #NO_APP
4948; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4949; CHECK-NEXT:    retq
4950  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4951  %2 = sext <16 x i16> %a0 to <16 x i32>
4952  ret <16 x i32> %2
4953}
4954
4955define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) {
4956; CHECK-LABEL: stack_fold_pmovsxwq_zmm:
4957; CHECK:       # %bb.0:
4958; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4959; CHECK-NEXT:    #APP
4960; CHECK-NEXT:    nop
4961; CHECK-NEXT:    #NO_APP
4962; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4963; CHECK-NEXT:    retq
4964  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4965  %2 = sext <8 x i16> %a0 to <8 x i64>
4966  ret <8 x i64> %2
4967}
4968
4969define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
4970; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm:
4971; CHECK:       # %bb.0:
4972; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4973; CHECK-NEXT:    #APP
4974; CHECK-NEXT:    nop
4975; CHECK-NEXT:    #NO_APP
4976; CHECK-NEXT:    kmovd %edi, %k1
4977; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
4978; CHECK-NEXT:    retq
4979  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4980  %2 = sext <8 x i16> %a0 to <8 x i64>
4981  %3 = bitcast i8 %mask to <8 x i1>
4982  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
4983  ret <8 x i64> %4
4984}
4985
4986define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
4987; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm:
4988; CHECK:       # %bb.0:
4989; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4990; CHECK-NEXT:    #APP
4991; CHECK-NEXT:    nop
4992; CHECK-NEXT:    #NO_APP
4993; CHECK-NEXT:    kmovd %edi, %k1
4994; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
4995; CHECK-NEXT:    retq
4996  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4997  %2 = sext <8 x i16> %a0 to <8 x i64>
4998  %3 = bitcast i8 %mask to <8 x i1>
4999  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5000  ret <8 x i64> %4
5001}
5002
5003define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) {
5004; CHECK-LABEL: stack_fold_vpmovusdb:
5005; CHECK:       # %bb.0:
5006; CHECK-NEXT:    vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5007; CHECK-NEXT:    #APP
5008; CHECK-NEXT:    nop
5009; CHECK-NEXT:    #NO_APP
5010; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5011; CHECK-NEXT:    vzeroupper
5012; CHECK-NEXT:    retq
5013  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
5014  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5015  ret <16 x i8> %1
5016}
5017declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
5018
5019define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) {
5020; CHECK-LABEL: stack_fold_vpmovusdw:
5021; CHECK:       # %bb.0:
5022; CHECK-NEXT:    vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5023; CHECK-NEXT:    #APP
5024; CHECK-NEXT:    nop
5025; CHECK-NEXT:    #NO_APP
5026; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5027; CHECK-NEXT:    retq
5028  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
5029  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5030  ret <16 x i16> %1
5031}
5032declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
5033
5034define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) {
5035; CHECK-LABEL: stack_fold_vpmovusqd:
5036; CHECK:       # %bb.0:
5037; CHECK-NEXT:    vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5038; CHECK-NEXT:    #APP
5039; CHECK-NEXT:    nop
5040; CHECK-NEXT:    #NO_APP
5041; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5042; CHECK-NEXT:    retq
5043  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
5044  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5045  ret <8 x i32> %1
5046}
5047declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
5048
5049define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) {
5050; CHECK-LABEL: stack_fold_vpmovusqw:
5051; CHECK:       # %bb.0:
5052; CHECK-NEXT:    vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5053; CHECK-NEXT:    #APP
5054; CHECK-NEXT:    nop
5055; CHECK-NEXT:    #NO_APP
5056; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5057; CHECK-NEXT:    vzeroupper
5058; CHECK-NEXT:    retq
5059  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
5060  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5061  ret <8 x i16> %1
5062}
5063declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
5064
5065define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) {
5066; CHECK-LABEL: stack_fold_vpmovuswb:
5067; CHECK:       # %bb.0:
5068; CHECK-NEXT:    vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5069; CHECK-NEXT:    #APP
5070; CHECK-NEXT:    nop
5071; CHECK-NEXT:    #NO_APP
5072; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5073; CHECK-NEXT:    retq
5074  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
5075  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5076  ret <32 x i8> %1
5077}
5078declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
5079
5080define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) {
5081; CHECK-LABEL: stack_fold_pmovzxbd_zmm:
5082; CHECK:       # %bb.0:
5083; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5084; CHECK-NEXT:    #APP
5085; CHECK-NEXT:    nop
5086; CHECK-NEXT:    #NO_APP
5087; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5088; CHECK-NEXT:    # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
5089; CHECK-NEXT:    retq
5090  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5091  %2 = zext <16 x i8> %a0 to <16 x i32>
5092  ret <16 x i32> %2
5093}
5094
5095define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) {
5096; CHECK-LABEL: stack_fold_pmovzxbq_zmm:
5097; CHECK:       # %bb.0:
5098; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5099; CHECK-NEXT:    #APP
5100; CHECK-NEXT:    nop
5101; CHECK-NEXT:    #NO_APP
5102; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5103; CHECK-NEXT:    # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
5104; CHECK-NEXT:    retq
5105  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5106  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5107  %3 = zext <8 x i8> %2 to <8 x i64>
5108  ret <8 x i64> %3
5109}
5110
5111define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) {
5112; CHECK-LABEL: stack_fold_pmovzxbw_zmm:
5113; CHECK:       # %bb.0:
5114; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5115; CHECK-NEXT:    #APP
5116; CHECK-NEXT:    nop
5117; CHECK-NEXT:    #NO_APP
5118; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5119; CHECK-NEXT:    # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
5120; CHECK-NEXT:    retq
5121  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5122  %2 = zext <32 x i8> %a0 to <32 x i16>
5123  ret <32 x i16> %2
5124}
5125
5126define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) {
5127; CHECK-LABEL: stack_fold_pmovzxdq_zmm:
5128; CHECK:       # %bb.0:
5129; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5130; CHECK-NEXT:    #APP
5131; CHECK-NEXT:    nop
5132; CHECK-NEXT:    #NO_APP
5133; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5134; CHECK-NEXT:    # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
5135; CHECK-NEXT:    retq
5136  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5137  %2 = zext <8 x i32> %a0 to <8 x i64>
5138  ret <8 x i64> %2
5139}
5140
5141define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) {
5142; CHECK-LABEL: stack_fold_pmovzxwd_zmm:
5143; CHECK:       # %bb.0:
5144; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5145; CHECK-NEXT:    #APP
5146; CHECK-NEXT:    nop
5147; CHECK-NEXT:    #NO_APP
5148; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5149; CHECK-NEXT:    # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
5150; CHECK-NEXT:    retq
5151  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5152  %2 = zext <16 x i16> %a0 to <16 x i32>
5153  ret <16 x i32> %2
5154}
5155
5156define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) {
5157; CHECK-LABEL: stack_fold_pmovzxwq_zmm:
5158; CHECK:       # %bb.0:
5159; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5160; CHECK-NEXT:    #APP
5161; CHECK-NEXT:    nop
5162; CHECK-NEXT:    #NO_APP
5163; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5164; CHECK-NEXT:    # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5165; CHECK-NEXT:    retq
5166  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5167  %2 = zext <8 x i16> %a0 to <8 x i64>
5168  ret <8 x i64> %2
5169}
5170
5171define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
5172; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm:
5173; CHECK:       # %bb.0:
5174; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5175; CHECK-NEXT:    #APP
5176; CHECK-NEXT:    nop
5177; CHECK-NEXT:    #NO_APP
5178; CHECK-NEXT:    kmovd %edi, %k1
5179; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
5180; CHECK-NEXT:    # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5181; CHECK-NEXT:    retq
5182  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5183  %2 = zext <8 x i16> %a0 to <8 x i64>
5184  %3 = bitcast i8 %mask to <8 x i1>
5185  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
5186  ret <8 x i64> %4
5187}
5188
5189define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
5190; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm:
5191; CHECK:       # %bb.0:
5192; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5193; CHECK-NEXT:    #APP
5194; CHECK-NEXT:    nop
5195; CHECK-NEXT:    #NO_APP
5196; CHECK-NEXT:    kmovd %edi, %k1
5197; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
5198; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5199; CHECK-NEXT:    retq
5200  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5201  %2 = zext <8 x i16> %a0 to <8 x i64>
5202  %3 = bitcast i8 %mask to <8 x i1>
5203  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5204  ret <8 x i64> %4
5205}
5206
5207define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) {
5208; CHECK-LABEL: stack_fold_pmulld:
5209; CHECK:       # %bb.0:
5210; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5211; CHECK-NEXT:    #APP
5212; CHECK-NEXT:    nop
5213; CHECK-NEXT:    #NO_APP
5214; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5215; CHECK-NEXT:    retq
5216  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5217  %2 = mul <16 x i32> %a0, %a1
5218  ret <16 x i32> %2
5219}
5220
5221define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5222; CHECK-LABEL: stack_fold_pmulld_commuted:
5223; CHECK:       # %bb.0:
5224; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5225; CHECK-NEXT:    #APP
5226; CHECK-NEXT:    nop
5227; CHECK-NEXT:    #NO_APP
5228; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5229; CHECK-NEXT:    retq
5230  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5231  %2 = mul <16 x i32> %a1, %a0
5232  ret <16 x i32> %2
5233}
5234
5235define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5236; CHECK-LABEL: stack_fold_pmulld_mask:
5237; CHECK:       # %bb.0:
5238; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5239; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5240; CHECK-NEXT:    #APP
5241; CHECK-NEXT:    nop
5242; CHECK-NEXT:    #NO_APP
5243; CHECK-NEXT:    kmovd %esi, %k1
5244; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5245; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5246; CHECK-NEXT:    retq
5247  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5248  %2 = mul <16 x i32> %a0, %a1
5249  %3 = bitcast i16 %mask to <16 x i1>
5250  ; load needed to keep the operation from being scheduled about the asm block
5251  %4 = load <16 x i32>, ptr %a2
5252  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5253  ret <16 x i32> %5
5254}
5255
5256define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5257; CHECK-LABEL: stack_fold_pmulld_mask_commuted:
5258; CHECK:       # %bb.0:
5259; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5260; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5261; CHECK-NEXT:    #APP
5262; CHECK-NEXT:    nop
5263; CHECK-NEXT:    #NO_APP
5264; CHECK-NEXT:    kmovd %esi, %k1
5265; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5266; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5267; CHECK-NEXT:    retq
5268  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5269  %2 = mul <16 x i32> %a1, %a0
5270  %3 = bitcast i16 %mask to <16 x i1>
5271  ; load needed to keep the operation from being scheduled about the asm block
5272  %4 = load <16 x i32>, ptr %a2
5273  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5274  ret <16 x i32> %5
5275}
5276
5277define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5278; CHECK-LABEL: stack_fold_pmulld_maskz:
5279; CHECK:       # %bb.0:
5280; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5281; CHECK-NEXT:    #APP
5282; CHECK-NEXT:    nop
5283; CHECK-NEXT:    #NO_APP
5284; CHECK-NEXT:    kmovd %edi, %k1
5285; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5286; CHECK-NEXT:    retq
5287  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5288  %2 = mul <16 x i32> %a0, %a1
5289  %3 = bitcast i16 %mask to <16 x i1>
5290  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5291  ret <16 x i32> %4
5292}
5293
5294define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5295; CHECK-LABEL: stack_fold_pmulld_maskz_commuted:
5296; CHECK:       # %bb.0:
5297; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5298; CHECK-NEXT:    #APP
5299; CHECK-NEXT:    nop
5300; CHECK-NEXT:    #NO_APP
5301; CHECK-NEXT:    kmovd %edi, %k1
5302; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5303; CHECK-NEXT:    retq
5304  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5305  %2 = mul <16 x i32> %a1, %a0
5306  %3 = bitcast i16 %mask to <16 x i1>
5307  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5308  ret <16 x i32> %4
5309}
5310
5311define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) {
5312; CHECK-LABEL: stack_fold_pmullq:
5313; CHECK:       # %bb.0:
5314; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5315; CHECK-NEXT:    #APP
5316; CHECK-NEXT:    nop
5317; CHECK-NEXT:    #NO_APP
5318; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5319; CHECK-NEXT:    retq
5320  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5321  %2 = mul <8 x i64> %a0, %a1
5322  ret <8 x i64> %2
5323}
5324
5325define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5326; CHECK-LABEL: stack_fold_pmullq_commuted:
5327; CHECK:       # %bb.0:
5328; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5329; CHECK-NEXT:    #APP
5330; CHECK-NEXT:    nop
5331; CHECK-NEXT:    #NO_APP
5332; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5333; CHECK-NEXT:    retq
5334  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5335  %2 = mul <8 x i64> %a1, %a0
5336  ret <8 x i64> %2
5337}
5338
5339define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5340; CHECK-LABEL: stack_fold_pmullq_mask:
5341; CHECK:       # %bb.0:
5342; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5343; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5344; CHECK-NEXT:    #APP
5345; CHECK-NEXT:    nop
5346; CHECK-NEXT:    #NO_APP
5347; CHECK-NEXT:    kmovd %esi, %k1
5348; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5349; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5350; CHECK-NEXT:    retq
5351  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5352  %2 = mul <8 x i64> %a0, %a1
5353  %3 = bitcast i8 %mask to <8 x i1>
5354  ; load needed to keep the operation from being scheduled about the asm block
5355  %4 = load <8 x i64>, ptr %a2
5356  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5357  ret <8 x i64> %5
5358}
5359
5360define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5361; CHECK-LABEL: stack_fold_pmullq_mask_commuted:
5362; CHECK:       # %bb.0:
5363; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5364; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5365; CHECK-NEXT:    #APP
5366; CHECK-NEXT:    nop
5367; CHECK-NEXT:    #NO_APP
5368; CHECK-NEXT:    kmovd %esi, %k1
5369; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5370; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5371; CHECK-NEXT:    retq
5372  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5373  %2 = mul <8 x i64> %a1, %a0
5374  %3 = bitcast i8 %mask to <8 x i1>
5375  ; load needed to keep the operation from being scheduled about the asm block
5376  %4 = load <8 x i64>, ptr %a2
5377  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5378  ret <8 x i64> %5
5379}
5380
5381define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5382; CHECK-LABEL: stack_fold_pmullq_maskz:
5383; CHECK:       # %bb.0:
5384; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5385; CHECK-NEXT:    #APP
5386; CHECK-NEXT:    nop
5387; CHECK-NEXT:    #NO_APP
5388; CHECK-NEXT:    kmovd %edi, %k1
5389; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5390; CHECK-NEXT:    retq
5391  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5392  %2 = mul <8 x i64> %a0, %a1
5393  %3 = bitcast i8 %mask to <8 x i1>
5394  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5395  ret <8 x i64> %4
5396}
5397
5398define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5399; CHECK-LABEL: stack_fold_pmullq_maskz_commuted:
5400; CHECK:       # %bb.0:
5401; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5402; CHECK-NEXT:    #APP
5403; CHECK-NEXT:    nop
5404; CHECK-NEXT:    #NO_APP
5405; CHECK-NEXT:    kmovd %edi, %k1
5406; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5407; CHECK-NEXT:    retq
5408  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5409  %2 = mul <8 x i64> %a1, %a0
5410  %3 = bitcast i8 %mask to <8 x i1>
5411  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5412  ret <8 x i64> %4
5413}
5414
5415define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) {
5416; CHECK-LABEL: stack_fold_pmullw:
5417; CHECK:       # %bb.0:
5418; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5419; CHECK-NEXT:    #APP
5420; CHECK-NEXT:    nop
5421; CHECK-NEXT:    #NO_APP
5422; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5423; CHECK-NEXT:    retq
5424  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5425  %2 = mul <32 x i16> %a0, %a1
5426  ret <32 x i16> %2
5427}
5428
5429define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
5430; CHECK-LABEL: stack_fold_pmullw_commuted:
5431; CHECK:       # %bb.0:
5432; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5433; CHECK-NEXT:    #APP
5434; CHECK-NEXT:    nop
5435; CHECK-NEXT:    #NO_APP
5436; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5437; CHECK-NEXT:    retq
5438  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5439  %2 = mul <32 x i16> %a1, %a0
5440  ret <32 x i16> %2
5441}
5442
5443define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
5444; CHECK-LABEL: stack_fold_pmullw_mask:
5445; CHECK:       # %bb.0:
5446; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5447; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5448; CHECK-NEXT:    #APP
5449; CHECK-NEXT:    nop
5450; CHECK-NEXT:    #NO_APP
5451; CHECK-NEXT:    kmovd %esi, %k1
5452; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5453; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5454; CHECK-NEXT:    retq
5455  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5456  %2 = mul <32 x i16> %a0, %a1
5457  %3 = bitcast i32 %mask to <32 x i1>
5458  ; load needed to keep the operation from being scheduled about the asm block
5459  %4 = load <32 x i16>, ptr %a2
5460  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5461  ret <32 x i16> %5
5462}
5463
5464define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
5465; CHECK-LABEL: stack_fold_pmullw_mask_commuted:
5466; CHECK:       # %bb.0:
5467; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5468; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5469; CHECK-NEXT:    #APP
5470; CHECK-NEXT:    nop
5471; CHECK-NEXT:    #NO_APP
5472; CHECK-NEXT:    kmovd %esi, %k1
5473; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5474; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5475; CHECK-NEXT:    retq
5476  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5477  %2 = mul <32 x i16> %a1, %a0
5478  %3 = bitcast i32 %mask to <32 x i1>
5479  ; load needed to keep the operation from being scheduled about the asm block
5480  %4 = load <32 x i16>, ptr %a2
5481  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5482  ret <32 x i16> %5
5483}
5484
5485define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5486; CHECK-LABEL: stack_fold_pmullw_maskz:
5487; CHECK:       # %bb.0:
5488; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5489; CHECK-NEXT:    #APP
5490; CHECK-NEXT:    nop
5491; CHECK-NEXT:    #NO_APP
5492; CHECK-NEXT:    kmovd %edi, %k1
5493; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5494; CHECK-NEXT:    retq
5495  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5496  %2 = mul <32 x i16> %a0, %a1
5497  %3 = bitcast i32 %mask to <32 x i1>
5498  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5499  ret <32 x i16> %4
5500}
5501
5502define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5503; CHECK-LABEL: stack_fold_pmullw_maskz_commuted:
5504; CHECK:       # %bb.0:
5505; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5506; CHECK-NEXT:    #APP
5507; CHECK-NEXT:    nop
5508; CHECK-NEXT:    #NO_APP
5509; CHECK-NEXT:    kmovd %edi, %k1
5510; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5511; CHECK-NEXT:    retq
5512  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5513  %2 = mul <32 x i16> %a1, %a0
5514  %3 = bitcast i32 %mask to <32 x i1>
5515  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5516  ret <32 x i16> %4
5517}
5518
5519define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) {
5520; CHECK-LABEL: stack_fold_pmuldq:
5521; CHECK:       # %bb.0:
5522; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5523; CHECK-NEXT:    #APP
5524; CHECK-NEXT:    nop
5525; CHECK-NEXT:    #NO_APP
5526; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5527; CHECK-NEXT:    retq
5528  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5529  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5530  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5531  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5532  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5533  %6 = mul <8 x i64> %3, %5
5534  ret <8 x i64> %6
5535}
5536
5537define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5538; CHECK-LABEL: stack_fold_pmuldq_commuted:
5539; CHECK:       # %bb.0:
5540; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5541; CHECK-NEXT:    #APP
5542; CHECK-NEXT:    nop
5543; CHECK-NEXT:    #NO_APP
5544; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5545; CHECK-NEXT:    retq
5546  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5547  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5548  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5549  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5550  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5551  %6 = mul <8 x i64> %5, %3
5552  ret <8 x i64> %6
5553}
5554
5555define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5556; CHECK-LABEL: stack_fold_pmuldq_mask:
5557; CHECK:       # %bb.0:
5558; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5559; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5560; CHECK-NEXT:    #APP
5561; CHECK-NEXT:    nop
5562; CHECK-NEXT:    #NO_APP
5563; CHECK-NEXT:    kmovd %esi, %k1
5564; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5565; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5566; CHECK-NEXT:    retq
5567  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5568  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5569  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5570  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5571  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5572  %6 = mul <8 x i64> %3, %5
5573  %7 = bitcast i8 %mask to <8 x i1>
5574  ; load needed to keep the operation from being scheduled about the asm block
5575  %8 = load <8 x i64>, ptr %a2
5576  %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5577  ret <8 x i64> %9
5578}
5579
5580define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5581; CHECK-LABEL: stack_fold_pmuldq_mask_commuted:
5582; CHECK:       # %bb.0:
5583; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5584; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5585; CHECK-NEXT:    #APP
5586; CHECK-NEXT:    nop
5587; CHECK-NEXT:    #NO_APP
5588; CHECK-NEXT:    kmovd %esi, %k1
5589; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5590; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5591; CHECK-NEXT:    retq
5592  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5593  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5594  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5595  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5596  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5597  %6 = mul <8 x i64> %5, %3
5598  %7 = bitcast i8 %mask to <8 x i1>
5599  ; load needed to keep the operation from being scheduled about the asm block
5600  %8 = load <8 x i64>, ptr %a2
5601  %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5602  ret <8 x i64> %9
5603}
5604
5605define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5606; CHECK-LABEL: stack_fold_pmuldq_maskz:
5607; CHECK:       # %bb.0:
5608; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5609; CHECK-NEXT:    #APP
5610; CHECK-NEXT:    nop
5611; CHECK-NEXT:    #NO_APP
5612; CHECK-NEXT:    kmovd %edi, %k1
5613; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5614; CHECK-NEXT:    retq
5615  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5616  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5617  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5618  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5619  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5620  %6 = mul <8 x i64> %3, %5
5621  %7 = bitcast i8 %mask to <8 x i1>
5622  %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5623  ret <8 x i64> %8
5624}
5625
5626define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5627; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted:
5628; CHECK:       # %bb.0:
5629; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5630; CHECK-NEXT:    #APP
5631; CHECK-NEXT:    nop
5632; CHECK-NEXT:    #NO_APP
5633; CHECK-NEXT:    kmovd %edi, %k1
5634; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5635; CHECK-NEXT:    retq
5636  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5637  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5638  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5639  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5640  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5641  %6 = mul <8 x i64> %5, %3
5642  %7 = bitcast i8 %mask to <8 x i1>
5643  %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5644  ret <8 x i64> %8
5645}
5646
5647
5648
5649
5650define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) {
5651; CHECK-LABEL: stack_fold_pmuludq:
5652; CHECK:       # %bb.0:
5653; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5654; CHECK-NEXT:    #APP
5655; CHECK-NEXT:    nop
5656; CHECK-NEXT:    #NO_APP
5657; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5658; CHECK-NEXT:    retq
5659  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5660  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5661  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5662  %4 = mul <8 x i64> %2, %3
5663  ret <8 x i64> %4
5664}
5665
5666define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5667; CHECK-LABEL: stack_fold_pmuludq_commuted:
5668; CHECK:       # %bb.0:
5669; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5670; CHECK-NEXT:    #APP
5671; CHECK-NEXT:    nop
5672; CHECK-NEXT:    #NO_APP
5673; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5674; CHECK-NEXT:    retq
5675  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5676  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5677  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5678  %4 = mul <8 x i64> %3, %2
5679  ret <8 x i64> %4
5680}
5681
5682define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5683; CHECK-LABEL: stack_fold_pmuludq_mask:
5684; CHECK:       # %bb.0:
5685; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5686; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5687; CHECK-NEXT:    #APP
5688; CHECK-NEXT:    nop
5689; CHECK-NEXT:    #NO_APP
5690; CHECK-NEXT:    kmovd %esi, %k1
5691; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5692; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5693; CHECK-NEXT:    retq
5694  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5695  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5696  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5697  %4 = mul <8 x i64> %2, %3
5698  %5 = bitcast i8 %mask to <8 x i1>
5699  ; load needed to keep the operation from being scheduled about the asm block
5700  %6 = load <8 x i64>, ptr %a2
5701  %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5702  ret <8 x i64> %7
5703}
5704
5705define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5706; CHECK-LABEL: stack_fold_pmuludq_mask_commuted:
5707; CHECK:       # %bb.0:
5708; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5709; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5710; CHECK-NEXT:    #APP
5711; CHECK-NEXT:    nop
5712; CHECK-NEXT:    #NO_APP
5713; CHECK-NEXT:    kmovd %esi, %k1
5714; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5715; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5716; CHECK-NEXT:    retq
5717  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5718  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5719  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5720  %4 = mul <8 x i64> %3, %2
5721  %5 = bitcast i8 %mask to <8 x i1>
5722  ; load needed to keep the operation from being scheduled about the asm block
5723  %6 = load <8 x i64>, ptr %a2
5724  %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5725  ret <8 x i64> %7
5726}
5727
5728define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5729; CHECK-LABEL: stack_fold_pmuludq_maskz:
5730; CHECK:       # %bb.0:
5731; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5732; CHECK-NEXT:    #APP
5733; CHECK-NEXT:    nop
5734; CHECK-NEXT:    #NO_APP
5735; CHECK-NEXT:    kmovd %edi, %k1
5736; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5737; CHECK-NEXT:    retq
5738  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5739  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5740  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5741  %4 = mul <8 x i64> %2, %3
5742  %5 = bitcast i8 %mask to <8 x i1>
5743  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5744  ret <8 x i64> %6
5745}
5746
5747define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5748; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted:
5749; CHECK:       # %bb.0:
5750; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5751; CHECK-NEXT:    #APP
5752; CHECK-NEXT:    nop
5753; CHECK-NEXT:    #NO_APP
5754; CHECK-NEXT:    kmovd %edi, %k1
5755; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5756; CHECK-NEXT:    retq
5757  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5758  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5759  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5760  %4 = mul <8 x i64> %3, %2
5761  %5 = bitcast i8 %mask to <8 x i1>
5762  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5763  ret <8 x i64> %6
5764}
5765
5766define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) {
5767; CHECK-LABEL: stack_fold_vpopcntd:
5768; CHECK:       # %bb.0:
5769; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5770; CHECK-NEXT:    #APP
5771; CHECK-NEXT:    nop
5772; CHECK-NEXT:    #NO_APP
5773; CHECK-NEXT:    vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5774; CHECK-NEXT:    retq
5775  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5776  %2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0)
5777  ret <16 x i32> %2
5778}
5779declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readonly
5780
5781define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) {
5782; CHECK-LABEL: stack_fold_vpopcntq:
5783; CHECK:       # %bb.0:
5784; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5785; CHECK-NEXT:    #APP
5786; CHECK-NEXT:    nop
5787; CHECK-NEXT:    #NO_APP
5788; CHECK-NEXT:    vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5789; CHECK-NEXT:    retq
5790  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5791  %2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0)
5792  ret <8 x i64> %2
5793}
5794declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
5795
5796define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) {
5797; CHECK-LABEL: stack_fold_pord:
5798; CHECK:       # %bb.0:
5799; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5800; CHECK-NEXT:    #APP
5801; CHECK-NEXT:    nop
5802; CHECK-NEXT:    #NO_APP
5803; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5804; CHECK-NEXT:    retq
5805  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5806  %2 = or <16 x i32> %a0, %a1
5807  ret <16 x i32> %2
5808}
5809
5810define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5811; CHECK-LABEL: stack_fold_pord_commuted:
5812; CHECK:       # %bb.0:
5813; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5814; CHECK-NEXT:    #APP
5815; CHECK-NEXT:    nop
5816; CHECK-NEXT:    #NO_APP
5817; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5818; CHECK-NEXT:    retq
5819  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5820  %2 = or <16 x i32> %a1, %a0
5821  ret <16 x i32> %2
5822}
5823
5824define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5825; CHECK-LABEL: stack_fold_pord_mask:
5826; CHECK:       # %bb.0:
5827; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5828; CHECK-NEXT:    vmovaps %zmm0, %zmm1
5829; CHECK-NEXT:    #APP
5830; CHECK-NEXT:    nop
5831; CHECK-NEXT:    #NO_APP
5832; CHECK-NEXT:    kmovd %esi, %k1
5833; CHECK-NEXT:    vmovaps (%rdi), %zmm0
5834; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5835; CHECK-NEXT:    retq
5836  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5837  %2 = or <16 x i32> %a0, %a1
5838  %3 = bitcast i16 %mask to <16 x i1>
5839  ; load needed to keep the operation from being scheduled about the asm block
5840  %4 = load <16 x i32>, ptr %a2
5841  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5842  ret <16 x i32> %5
5843}
5844
5845define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5846; CHECK-LABEL: stack_fold_pord_mask_commuted:
5847; CHECK:       # %bb.0:
5848; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5849; CHECK-NEXT:    vmovaps %zmm0, %zmm1
5850; CHECK-NEXT:    #APP
5851; CHECK-NEXT:    nop
5852; CHECK-NEXT:    #NO_APP
5853; CHECK-NEXT:    kmovd %esi, %k1
5854; CHECK-NEXT:    vmovaps (%rdi), %zmm0
5855; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5856; CHECK-NEXT:    retq
5857  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5858  %2 = or <16 x i32> %a1, %a0
5859  %3 = bitcast i16 %mask to <16 x i1>
5860  ; load needed to keep the operation from being scheduled about the asm block
5861  %4 = load <16 x i32>, ptr %a2
5862  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5863  ret <16 x i32> %5
5864}
5865
5866define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5867; CHECK-LABEL: stack_fold_pord_maskz:
5868; CHECK:       # %bb.0:
5869; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5870; CHECK-NEXT:    #APP
5871; CHECK-NEXT:    nop
5872; CHECK-NEXT:    #NO_APP
5873; CHECK-NEXT:    kmovd %edi, %k1
5874; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5875; CHECK-NEXT:    retq
5876  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5877  %2 = or <16 x i32> %a0, %a1
5878  %3 = bitcast i16 %mask to <16 x i1>
5879  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5880  ret <16 x i32> %4
5881}
5882
5883define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5884; CHECK-LABEL: stack_fold_pord_maskz_commuted:
5885; CHECK:       # %bb.0:
5886; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5887; CHECK-NEXT:    #APP
5888; CHECK-NEXT:    nop
5889; CHECK-NEXT:    #NO_APP
5890; CHECK-NEXT:    kmovd %edi, %k1
5891; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5892; CHECK-NEXT:    retq
5893  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5894  %2 = or <16 x i32> %a1, %a0
5895  %3 = bitcast i16 %mask to <16 x i1>
5896  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5897  ret <16 x i32> %4
5898}
5899
5900define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) {
5901; CHECK-LABEL: stack_fold_porq:
5902; CHECK:       # %bb.0:
5903; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5904; CHECK-NEXT:    #APP
5905; CHECK-NEXT:    nop
5906; CHECK-NEXT:    #NO_APP
5907; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5908; CHECK-NEXT:    retq
5909  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5910  %2 = or <8 x i64> %a0, %a1
5911  ret <8 x i64> %2
5912}
5913
5914define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5915; CHECK-LABEL: stack_fold_porq_commuted:
5916; CHECK:       # %bb.0:
5917; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5918; CHECK-NEXT:    #APP
5919; CHECK-NEXT:    nop
5920; CHECK-NEXT:    #NO_APP
5921; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5922; CHECK-NEXT:    retq
5923  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5924  %2 = or <8 x i64> %a1, %a0
5925  ret <8 x i64> %2
5926}
5927
5928define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5929; CHECK-LABEL: stack_fold_porq_mask:
5930; CHECK:       # %bb.0:
5931; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5932; CHECK-NEXT:    vmovapd %zmm0, %zmm1
5933; CHECK-NEXT:    #APP
5934; CHECK-NEXT:    nop
5935; CHECK-NEXT:    #NO_APP
5936; CHECK-NEXT:    kmovd %esi, %k1
5937; CHECK-NEXT:    vmovapd (%rdi), %zmm0
5938; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5939; CHECK-NEXT:    retq
5940  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5941  %2 = or <8 x i64> %a0, %a1
5942  %3 = bitcast i8 %mask to <8 x i1>
5943  ; load needed to keep the operation from being scheduled about the asm block
5944  %4 = load <8 x i64>, ptr %a2
5945  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5946  ret <8 x i64> %5
5947}
5948
5949define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5950; CHECK-LABEL: stack_fold_porq_mask_commuted:
5951; CHECK:       # %bb.0:
5952; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5953; CHECK-NEXT:    vmovapd %zmm0, %zmm1
5954; CHECK-NEXT:    #APP
5955; CHECK-NEXT:    nop
5956; CHECK-NEXT:    #NO_APP
5957; CHECK-NEXT:    kmovd %esi, %k1
5958; CHECK-NEXT:    vmovapd (%rdi), %zmm0
5959; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5960; CHECK-NEXT:    retq
5961  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5962  %2 = or <8 x i64> %a1, %a0
5963  %3 = bitcast i8 %mask to <8 x i1>
5964  ; load needed to keep the operation from being scheduled about the asm block
5965  %4 = load <8 x i64>, ptr %a2
5966  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5967  ret <8 x i64> %5
5968}
5969
5970define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5971; CHECK-LABEL: stack_fold_porq_maskz:
5972; CHECK:       # %bb.0:
5973; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5974; CHECK-NEXT:    #APP
5975; CHECK-NEXT:    nop
5976; CHECK-NEXT:    #NO_APP
5977; CHECK-NEXT:    kmovd %edi, %k1
5978; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5979; CHECK-NEXT:    retq
5980  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5981  %2 = or <8 x i64> %a0, %a1
5982  %3 = bitcast i8 %mask to <8 x i1>
5983  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5984  ret <8 x i64> %4
5985}
5986
5987define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5988; CHECK-LABEL: stack_fold_porq_maskz_commuted:
5989; CHECK:       # %bb.0:
5990; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5991; CHECK-NEXT:    #APP
5992; CHECK-NEXT:    nop
5993; CHECK-NEXT:    #NO_APP
5994; CHECK-NEXT:    kmovd %edi, %k1
5995; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5996; CHECK-NEXT:    retq
5997  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5998  %2 = or <8 x i64> %a1, %a0
5999  %3 = bitcast i8 %mask to <8 x i1>
6000  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
6001  ret <8 x i64> %4
6002}
6003
6004define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) {
6005; CHECK-LABEL: stack_fold_psadbw:
6006; CHECK:       # %bb.0:
6007; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6008; CHECK-NEXT:    #APP
6009; CHECK-NEXT:    nop
6010; CHECK-NEXT:    #NO_APP
6011; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6012; CHECK-NEXT:    retq
6013  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6014  %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1)
6015  ret <8 x i64> %2
6016}
6017declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone
6018
6019define <8 x i64> @stack_fold_psadbw_commute(<64 x i8> %a0, <64 x i8> %a1) {
6020; CHECK-LABEL: stack_fold_psadbw_commute:
6021; CHECK:       # %bb.0:
6022; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6023; CHECK-NEXT:    #APP
6024; CHECK-NEXT:    nop
6025; CHECK-NEXT:    #NO_APP
6026; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6027; CHECK-NEXT:    retq
6028  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6029  %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a1, <64 x i8> %a0)
6030  ret <8 x i64> %2
6031}
6032
6033define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
6034; CHECK-LABEL: stack_fold_pshufb_zmm:
6035; CHECK:       # %bb.0:
6036; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6037; CHECK-NEXT:    #APP
6038; CHECK-NEXT:    nop
6039; CHECK-NEXT:    #NO_APP
6040; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6041; CHECK-NEXT:    retq
6042  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6043  %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6044  ret <64 x i8> %2
6045}
6046declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
6047
6048define <64 x i8> @stack_fold_pshufb_zmm_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6049; CHECK-LABEL: stack_fold_pshufb_zmm_mask:
6050; CHECK:       # %bb.0:
6051; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6052; CHECK-NEXT:    #APP
6053; CHECK-NEXT:    nop
6054; CHECK-NEXT:    #NO_APP
6055; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
6056; CHECK-NEXT:    kmovq %rsi, %k1
6057; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6058; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
6059; CHECK-NEXT:    retq
6060  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6061  %2 = load <64 x i8>, ptr %passthru
6062  %3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6063  %4 = bitcast i64 %mask to <64 x i1>
6064  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %2
6065  ret <64 x i8> %5
6066}
6067
6068define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6069; CHECK-LABEL: stack_fold_pshufb_zmm_maskz:
6070; CHECK:       # %bb.0:
6071; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6072; CHECK-NEXT:    #APP
6073; CHECK-NEXT:    nop
6074; CHECK-NEXT:    #NO_APP
6075; CHECK-NEXT:    kmovq %rdi, %k1
6076; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6077; CHECK-NEXT:    retq
6078  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6079  %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6080  %3 = bitcast i64 %mask to <64 x i1>
6081  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
6082  ret <64 x i8> %4
6083}
6084
6085define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) {
6086; CHECK-LABEL: stack_fold_pshufd_zmm:
6087; CHECK:       # %bb.0:
6088; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6089; CHECK-NEXT:    #APP
6090; CHECK-NEXT:    nop
6091; CHECK-NEXT:    #NO_APP
6092; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6093; CHECK-NEXT:    # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6094; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
6095; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
6096; CHECK-NEXT:    retq
6097  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6098  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6099  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
6100  ret <16 x i32> %3
6101}
6102
6103define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
6104; CHECK-LABEL: stack_fold_pshufd_zmm_mask:
6105; CHECK:       # %bb.0:
6106; CHECK-NEXT:    pushq %rax
6107; CHECK-NEXT:    .cfi_def_cfa_offset 16
6108; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6109; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6110; CHECK-NEXT:    #APP
6111; CHECK-NEXT:    nop
6112; CHECK-NEXT:    #NO_APP
6113; CHECK-NEXT:    kmovd %edi, %k1
6114; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6115; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6116; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6117; CHECK-NEXT:    popq %rax
6118; CHECK-NEXT:    .cfi_def_cfa_offset 8
6119; CHECK-NEXT:    retq
6120  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6121  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6122  %3 = bitcast i16 %mask to <16 x i1>
6123  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %passthru
6124  ret <16 x i32> %4
6125}
6126
6127define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) {
6128; CHECK-LABEL: stack_fold_pshufd_zmm_maskz:
6129; CHECK:       # %bb.0:
6130; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6131; CHECK-NEXT:    #APP
6132; CHECK-NEXT:    nop
6133; CHECK-NEXT:    #NO_APP
6134; CHECK-NEXT:    kmovd %edi, %k1
6135; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6136; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6137; CHECK-NEXT:    retq
6138  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6139  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6140  %3 = bitcast i16 %mask to <16 x i1>
6141  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6142  ret <16 x i32> %4
6143}
6144
6145define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) {
6146; CHECK-LABEL: stack_fold_pshufhw_zmm:
6147; CHECK:       # %bb.0:
6148; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6149; CHECK-NEXT:    #APP
6150; CHECK-NEXT:    nop
6151; CHECK-NEXT:    #NO_APP
6152; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6153; CHECK-NEXT:    # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6154; CHECK-NEXT:    retq
6155  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6156  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6157  ret <32 x i16> %2
6158}
6159
6160define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6161; CHECK-LABEL: stack_fold_pshufhw_zmm_mask:
6162; CHECK:       # %bb.0:
6163; CHECK-NEXT:    pushq %rax
6164; CHECK-NEXT:    .cfi_def_cfa_offset 16
6165; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6166; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6167; CHECK-NEXT:    #APP
6168; CHECK-NEXT:    nop
6169; CHECK-NEXT:    #NO_APP
6170; CHECK-NEXT:    kmovd %edi, %k1
6171; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6172; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6173; CHECK-NEXT:    # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6174; CHECK-NEXT:    popq %rax
6175; CHECK-NEXT:    .cfi_def_cfa_offset 8
6176; CHECK-NEXT:    retq
6177  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6178  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6179  %3 = bitcast i32 %mask to <32 x i1>
6180  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6181  ret <32 x i16> %4
6182}
6183
6184define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6185; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz:
6186; CHECK:       # %bb.0:
6187; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6188; CHECK-NEXT:    #APP
6189; CHECK-NEXT:    nop
6190; CHECK-NEXT:    #NO_APP
6191; CHECK-NEXT:    kmovd %edi, %k1
6192; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6193; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6194; CHECK-NEXT:    retq
6195  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6196  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6197  %3 = bitcast i32 %mask to <32 x i1>
6198  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6199  ret <32 x i16> %4
6200}
6201
6202define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) {
6203; CHECK-LABEL: stack_fold_pshuflw_zmm:
6204; CHECK:       # %bb.0:
6205; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6206; CHECK-NEXT:    #APP
6207; CHECK-NEXT:    nop
6208; CHECK-NEXT:    #NO_APP
6209; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6210; CHECK-NEXT:    # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6211; CHECK-NEXT:    retq
6212  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6213  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6214  ret <32 x i16> %2
6215}
6216
6217define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6218; CHECK-LABEL: stack_fold_pshuflw_zmm_mask:
6219; CHECK:       # %bb.0:
6220; CHECK-NEXT:    pushq %rax
6221; CHECK-NEXT:    .cfi_def_cfa_offset 16
6222; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6223; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6224; CHECK-NEXT:    #APP
6225; CHECK-NEXT:    nop
6226; CHECK-NEXT:    #NO_APP
6227; CHECK-NEXT:    kmovd %edi, %k1
6228; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6229; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6230; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6231; CHECK-NEXT:    popq %rax
6232; CHECK-NEXT:    .cfi_def_cfa_offset 8
6233; CHECK-NEXT:    retq
6234  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6235  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6236  %3 = bitcast i32 %mask to <32 x i1>
6237  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6238  ret <32 x i16> %4
6239}
6240
6241define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6242; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz:
6243; CHECK:       # %bb.0:
6244; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6245; CHECK-NEXT:    #APP
6246; CHECK-NEXT:    nop
6247; CHECK-NEXT:    #NO_APP
6248; CHECK-NEXT:    kmovd %edi, %k1
6249; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6250; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6251; CHECK-NEXT:    retq
6252  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6253  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6254  %3 = bitcast i32 %mask to <32 x i1>
6255  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6256  ret <32 x i16> %4
6257}
6258
6259define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) {
6260; CHECK-LABEL: stack_fold_pslld:
6261; CHECK:       # %bb.0:
6262; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6263; CHECK-NEXT:    #APP
6264; CHECK-NEXT:    nop
6265; CHECK-NEXT:    #NO_APP
6266; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6267; CHECK-NEXT:    retq
6268  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6269  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6270  ret <16 x i32> %2
6271}
6272declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6273
6274define <16 x i32> @stack_fold_pslld_mask(ptr %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6275; CHECK-LABEL: stack_fold_pslld_mask:
6276; CHECK:       # %bb.0:
6277; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6278; CHECK-NEXT:    #APP
6279; CHECK-NEXT:    nop
6280; CHECK-NEXT:    #NO_APP
6281; CHECK-NEXT:    kmovd %esi, %k1
6282; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
6283; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
6284; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
6285; CHECK-NEXT:    retq
6286  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6287  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6288  %3 = bitcast i16 %mask to <16 x i1>
6289  %4 = load <16 x i32>, ptr %passthru
6290  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6291  ret <16 x i32> %5
6292}
6293
6294define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6295; CHECK-LABEL: stack_fold_pslld_maskz:
6296; CHECK:       # %bb.0:
6297; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6298; CHECK-NEXT:    #APP
6299; CHECK-NEXT:    nop
6300; CHECK-NEXT:    #NO_APP
6301; CHECK-NEXT:    kmovd %edi, %k1
6302; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload
6303; CHECK-NEXT:    retq
6304  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6305  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6306  %3 = bitcast i16 %mask to <16 x i1>
6307  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6308  ret <16 x i32> %4
6309}
6310
6311define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) {
6312; CHECK-LABEL: stack_fold_pslldi:
6313; CHECK:       # %bb.0:
6314; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6315; CHECK-NEXT:    #APP
6316; CHECK-NEXT:    nop
6317; CHECK-NEXT:    #NO_APP
6318; CHECK-NEXT:    vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6319; CHECK-NEXT:    retq
6320  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6321  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6322  ret <16 x i32> %2
6323}
6324declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
6325
6326define <16 x i32> @stack_fold_pslldi_mask(ptr %passthru, <16 x i32> %a0, i16 %mask) {
6327; CHECK-LABEL: stack_fold_pslldi_mask:
6328; CHECK:       # %bb.0:
6329; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6330; CHECK-NEXT:    #APP
6331; CHECK-NEXT:    nop
6332; CHECK-NEXT:    #NO_APP
6333; CHECK-NEXT:    kmovd %esi, %k1
6334; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
6335; CHECK-NEXT:    vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
6336; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
6337; CHECK-NEXT:    retq
6338  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6339  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6340  %3 = bitcast i16 %mask to <16 x i1>
6341  %4 = load <16 x i32>, ptr %passthru
6342  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6343  ret <16 x i32> %5
6344}
6345
6346define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) {
6347; CHECK-LABEL: stack_fold_pslldi_maskz:
6348; CHECK:       # %bb.0:
6349; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6350; CHECK-NEXT:    #APP
6351; CHECK-NEXT:    nop
6352; CHECK-NEXT:    #NO_APP
6353; CHECK-NEXT:    kmovd %edi, %k1
6354; CHECK-NEXT:    vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6355; CHECK-NEXT:    retq
6356  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6357  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6358  %3 = bitcast i16 %mask to <16 x i1>
6359  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6360  ret <16 x i32> %4
6361}
6362
6363define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) {
6364; CHECK-LABEL: stack_fold_pslldq:
6365; CHECK:       # %bb.0:
6366; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6367; CHECK-NEXT:    #APP
6368; CHECK-NEXT:    nop
6369; CHECK-NEXT:    #NO_APP
6370; CHECK-NEXT:    vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6371; CHECK-NEXT:    # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
6372; CHECK-NEXT:    retq
6373  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6374  %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
6375  ret <64 x i8> %2
6376}
6377
6378define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) {
6379; CHECK-LABEL: stack_fold_psllq:
6380; CHECK:       # %bb.0:
6381; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6382; CHECK-NEXT:    #APP
6383; CHECK-NEXT:    nop
6384; CHECK-NEXT:    #NO_APP
6385; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6386; CHECK-NEXT:    retq
6387  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6388  %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1)
6389  ret <8 x i64> %2
6390}
6391declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6392
6393define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) {
6394; CHECK-LABEL: stack_fold_psllqi:
6395; CHECK:       # %bb.0:
6396; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6397; CHECK-NEXT:    #APP
6398; CHECK-NEXT:    nop
6399; CHECK-NEXT:    #NO_APP
6400; CHECK-NEXT:    vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6401; CHECK-NEXT:    retq
6402  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6403  %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1)
6404  ret <8 x i64> %2
6405}
6406declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
6407
6408define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) {
6409; CHECK-LABEL: stack_fold_psllvd:
6410; CHECK:       # %bb.0:
6411; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6412; CHECK-NEXT:    #APP
6413; CHECK-NEXT:    nop
6414; CHECK-NEXT:    #NO_APP
6415; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6416; CHECK-NEXT:    retq
6417  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6418  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6419  ret <16 x i32> %2
6420}
6421declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6422
6423define <16 x i32> @stack_fold_psllvd_mask(ptr %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6424; CHECK-LABEL: stack_fold_psllvd_mask:
6425; CHECK:       # %bb.0:
6426; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6427; CHECK-NEXT:    #APP
6428; CHECK-NEXT:    nop
6429; CHECK-NEXT:    #NO_APP
6430; CHECK-NEXT:    kmovd %esi, %k1
6431; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
6432; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6433; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
6434; CHECK-NEXT:    retq
6435  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6436  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6437  %3 = bitcast i16 %mask to <16 x i1>
6438  %4 = load <16 x i32>, ptr %passthru
6439  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6440  ret <16 x i32> %5
6441}
6442
6443define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6444; CHECK-LABEL: stack_fold_psllvd_maskz:
6445; CHECK:       # %bb.0:
6446; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6447; CHECK-NEXT:    #APP
6448; CHECK-NEXT:    nop
6449; CHECK-NEXT:    #NO_APP
6450; CHECK-NEXT:    kmovd %edi, %k1
6451; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6452; CHECK-NEXT:    retq
6453  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6454  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6455  %3 = bitcast i16 %mask to <16 x i1>
6456  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6457  ret <16 x i32> %4
6458}
6459
6460define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) {
6461; CHECK-LABEL: stack_fold_psllvq:
6462; CHECK:       # %bb.0:
6463; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6464; CHECK-NEXT:    #APP
6465; CHECK-NEXT:    nop
6466; CHECK-NEXT:    #NO_APP
6467; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6468; CHECK-NEXT:    retq
6469  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6470  %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6471  ret <8 x i64> %2
6472}
6473declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6474
6475define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) {
6476; CHECK-LABEL: stack_fold_psllvw:
6477; CHECK:       # %bb.0:
6478; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6479; CHECK-NEXT:    #APP
6480; CHECK-NEXT:    nop
6481; CHECK-NEXT:    #NO_APP
6482; CHECK-NEXT:    vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6483; CHECK-NEXT:    retq
6484  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6485  %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6486  ret <32 x i16> %2
6487}
6488declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6489
6490define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) {
6491; CHECK-LABEL: stack_fold_psllw:
6492; CHECK:       # %bb.0:
6493; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6494; CHECK-NEXT:    #APP
6495; CHECK-NEXT:    nop
6496; CHECK-NEXT:    #NO_APP
6497; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6498; CHECK-NEXT:    retq
6499  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6500  %2 = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1)
6501  ret <32 x i16> %2
6502}
6503declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6504
6505define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) {
6506; CHECK-LABEL: stack_fold_psllwi:
6507; CHECK:       # %bb.0:
6508; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6509; CHECK-NEXT:    #APP
6510; CHECK-NEXT:    nop
6511; CHECK-NEXT:    #NO_APP
6512; CHECK-NEXT:    vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6513; CHECK-NEXT:    retq
6514  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6515  %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1)
6516  ret <32 x i16> %2
6517}
6518declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone
6519
6520define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) {
6521; CHECK-LABEL: stack_fold_psrad:
6522; CHECK:       # %bb.0:
6523; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6524; CHECK-NEXT:    #APP
6525; CHECK-NEXT:    nop
6526; CHECK-NEXT:    #NO_APP
6527; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6528; CHECK-NEXT:    retq
6529  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6530  %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1)
6531  ret <16 x i32> %2
6532}
6533declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6534
6535define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) {
6536; CHECK-LABEL: stack_fold_psradi:
6537; CHECK:       # %bb.0:
6538; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6539; CHECK-NEXT:    #APP
6540; CHECK-NEXT:    nop
6541; CHECK-NEXT:    #NO_APP
6542; CHECK-NEXT:    vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6543; CHECK-NEXT:    retq
6544  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6545  %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1)
6546  ret <16 x i32> %2
6547}
6548declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
6549
6550define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) {
6551; CHECK-LABEL: stack_fold_psraq:
6552; CHECK:       # %bb.0:
6553; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6554; CHECK-NEXT:    #APP
6555; CHECK-NEXT:    nop
6556; CHECK-NEXT:    #NO_APP
6557; CHECK-NEXT:    vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6558; CHECK-NEXT:    retq
6559  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6560  %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1)
6561  ret <8 x i64> %2
6562}
6563declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6564
6565define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) {
6566; CHECK-LABEL: stack_fold_psraqi:
6567; CHECK:       # %bb.0:
6568; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6569; CHECK-NEXT:    #APP
6570; CHECK-NEXT:    nop
6571; CHECK-NEXT:    #NO_APP
6572; CHECK-NEXT:    vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6573; CHECK-NEXT:    retq
6574  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6575  %2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1)
6576  ret <8 x i64> %2
6577}
6578declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
6579
6580define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) {
6581; CHECK-LABEL: stack_fold_psravd:
6582; CHECK:       # %bb.0:
6583; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6584; CHECK-NEXT:    #APP
6585; CHECK-NEXT:    nop
6586; CHECK-NEXT:    #NO_APP
6587; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6588; CHECK-NEXT:    retq
6589  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6590  %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
6591  ret <16 x i32> %2
6592}
6593declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6594
6595define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) {
6596; CHECK-LABEL: stack_fold_psravq:
6597; CHECK:       # %bb.0:
6598; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6599; CHECK-NEXT:    #APP
6600; CHECK-NEXT:    nop
6601; CHECK-NEXT:    #NO_APP
6602; CHECK-NEXT:    vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6603; CHECK-NEXT:    retq
6604  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6605  %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
6606  ret <8 x i64> %2
6607}
6608declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6609
6610define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) {
6611; CHECK-LABEL: stack_fold_psravw:
6612; CHECK:       # %bb.0:
6613; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6614; CHECK-NEXT:    #APP
6615; CHECK-NEXT:    nop
6616; CHECK-NEXT:    #NO_APP
6617; CHECK-NEXT:    vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6618; CHECK-NEXT:    retq
6619  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6620  %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1)
6621  ret <32 x i16> %2
6622}
6623declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6624
6625define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) {
6626; CHECK-LABEL: stack_fold_psraw:
6627; CHECK:       # %bb.0:
6628; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6629; CHECK-NEXT:    #APP
6630; CHECK-NEXT:    nop
6631; CHECK-NEXT:    #NO_APP
6632; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6633; CHECK-NEXT:    retq
6634  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6635  %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1)
6636  ret <32 x i16> %2
6637}
6638declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6639
6640define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) {
6641; CHECK-LABEL: stack_fold_psrawi:
6642; CHECK:       # %bb.0:
6643; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6644; CHECK-NEXT:    #APP
6645; CHECK-NEXT:    nop
6646; CHECK-NEXT:    #NO_APP
6647; CHECK-NEXT:    vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6648; CHECK-NEXT:    retq
6649  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6650  %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1)
6651  ret <32 x i16> %2
6652}
6653declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone
6654
6655define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) {
6656; CHECK-LABEL: stack_fold_psrld:
6657; CHECK:       # %bb.0:
6658; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6659; CHECK-NEXT:    #APP
6660; CHECK-NEXT:    nop
6661; CHECK-NEXT:    #NO_APP
6662; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6663; CHECK-NEXT:    retq
6664  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6665  %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1)
6666  ret <16 x i32> %2
6667}
6668declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6669
6670define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) {
6671; CHECK-LABEL: stack_fold_psrldi:
6672; CHECK:       # %bb.0:
6673; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6674; CHECK-NEXT:    #APP
6675; CHECK-NEXT:    nop
6676; CHECK-NEXT:    #NO_APP
6677; CHECK-NEXT:    vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6678; CHECK-NEXT:    retq
6679  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6680  %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1)
6681  ret <16 x i32> %2
6682}
6683declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
6684
6685define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) {
6686; CHECK-LABEL: stack_fold_psrldq:
6687; CHECK:       # %bb.0:
6688; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6689; CHECK-NEXT:    #APP
6690; CHECK-NEXT:    nop
6691; CHECK-NEXT:    #NO_APP
6692; CHECK-NEXT:    vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6693; CHECK-NEXT:    # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
6694; CHECK-NEXT:    retq
6695  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6696  %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
6697  ret <64 x i8> %2
6698}
6699
6700define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) {
6701; CHECK-LABEL: stack_fold_psrlq:
6702; CHECK:       # %bb.0:
6703; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6704; CHECK-NEXT:    #APP
6705; CHECK-NEXT:    nop
6706; CHECK-NEXT:    #NO_APP
6707; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6708; CHECK-NEXT:    retq
6709  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6710  %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1)
6711  ret <8 x i64> %2
6712}
6713declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6714
6715define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) {
6716; CHECK-LABEL: stack_fold_psrlqi:
6717; CHECK:       # %bb.0:
6718; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6719; CHECK-NEXT:    #APP
6720; CHECK-NEXT:    nop
6721; CHECK-NEXT:    #NO_APP
6722; CHECK-NEXT:    vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6723; CHECK-NEXT:    retq
6724  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6725  %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1)
6726  ret <8 x i64> %2
6727}
6728declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
6729
6730define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) {
6731; CHECK-LABEL: stack_fold_psrlvd:
6732; CHECK:       # %bb.0:
6733; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6734; CHECK-NEXT:    #APP
6735; CHECK-NEXT:    nop
6736; CHECK-NEXT:    #NO_APP
6737; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6738; CHECK-NEXT:    retq
6739  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6740  %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6741  ret <16 x i32> %2
6742}
6743declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6744
6745define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) {
6746; CHECK-LABEL: stack_fold_psrlvq:
6747; CHECK:       # %bb.0:
6748; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6749; CHECK-NEXT:    #APP
6750; CHECK-NEXT:    nop
6751; CHECK-NEXT:    #NO_APP
6752; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6753; CHECK-NEXT:    retq
6754  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6755  %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6756  ret <8 x i64> %2
6757}
6758declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6759
6760define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) {
6761; CHECK-LABEL: stack_fold_psrlvw:
6762; CHECK:       # %bb.0:
6763; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6764; CHECK-NEXT:    #APP
6765; CHECK-NEXT:    nop
6766; CHECK-NEXT:    #NO_APP
6767; CHECK-NEXT:    vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6768; CHECK-NEXT:    retq
6769  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6770  %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6771  ret <32 x i16> %2
6772}
6773declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6774
6775define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) {
6776; CHECK-LABEL: stack_fold_psrlw:
6777; CHECK:       # %bb.0:
6778; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6779; CHECK-NEXT:    #APP
6780; CHECK-NEXT:    nop
6781; CHECK-NEXT:    #NO_APP
6782; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6783; CHECK-NEXT:    retq
6784  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6785  %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1)
6786  ret <32 x i16> %2
6787}
6788declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6789
6790define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) {
6791; CHECK-LABEL: stack_fold_psrlwi:
6792; CHECK:       # %bb.0:
6793; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6794; CHECK-NEXT:    #APP
6795; CHECK-NEXT:    nop
6796; CHECK-NEXT:    #NO_APP
6797; CHECK-NEXT:    vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6798; CHECK-NEXT:    retq
6799  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6800  %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1)
6801  ret <32 x i16> %2
6802}
6803declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone
6804
6805define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) {
6806; CHECK-LABEL: stack_fold_psubb:
6807; CHECK:       # %bb.0:
6808; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6809; CHECK-NEXT:    #APP
6810; CHECK-NEXT:    nop
6811; CHECK-NEXT:    #NO_APP
6812; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6813; CHECK-NEXT:    retq
6814  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6815  %2 = sub <64 x i8> %a0, %a1
6816  ret <64 x i8> %2
6817}
6818
6819define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) {
6820; CHECK-LABEL: stack_fold_psubd:
6821; CHECK:       # %bb.0:
6822; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6823; CHECK-NEXT:    #APP
6824; CHECK-NEXT:    nop
6825; CHECK-NEXT:    #NO_APP
6826; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6827; CHECK-NEXT:    retq
6828  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6829  %2 = sub <16 x i32> %a0, %a1
6830  ret <16 x i32> %2
6831}
6832
6833define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) {
6834; CHECK-LABEL: stack_fold_psubq:
6835; CHECK:       # %bb.0:
6836; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6837; CHECK-NEXT:    #APP
6838; CHECK-NEXT:    nop
6839; CHECK-NEXT:    #NO_APP
6840; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6841; CHECK-NEXT:    retq
6842  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6843  %2 = sub <8 x i64> %a0, %a1
6844  ret <8 x i64> %2
6845}
6846
6847define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) {
6848; CHECK-LABEL: stack_fold_psubsb:
6849; CHECK:       # %bb.0:
6850; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6851; CHECK-NEXT:    #APP
6852; CHECK-NEXT:    nop
6853; CHECK-NEXT:    #NO_APP
6854; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6855; CHECK-NEXT:    retq
6856  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6857  %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6858  ret <64 x i8> %2
6859}
6860
6861define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) {
6862; CHECK-LABEL: stack_fold_psubsw:
6863; CHECK:       # %bb.0:
6864; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6865; CHECK-NEXT:    #APP
6866; CHECK-NEXT:    nop
6867; CHECK-NEXT:    #NO_APP
6868; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6869; CHECK-NEXT:    retq
6870  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6871  %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6872  ret <32 x i16> %2
6873}
6874
6875define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) {
6876; CHECK-LABEL: stack_fold_psubusb:
6877; CHECK:       # %bb.0:
6878; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6879; CHECK-NEXT:    #APP
6880; CHECK-NEXT:    nop
6881; CHECK-NEXT:    #NO_APP
6882; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6883; CHECK-NEXT:    retq
6884  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6885  %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6886  ret <64 x i8> %2
6887}
6888
6889define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) {
6890; CHECK-LABEL: stack_fold_psubusw:
6891; CHECK:       # %bb.0:
6892; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6893; CHECK-NEXT:    #APP
6894; CHECK-NEXT:    nop
6895; CHECK-NEXT:    #NO_APP
6896; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6897; CHECK-NEXT:    retq
6898  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6899  %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6900  ret <32 x i16> %2
6901}
6902
6903define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) {
6904; CHECK-LABEL: stack_fold_psubw:
6905; CHECK:       # %bb.0:
6906; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6907; CHECK-NEXT:    #APP
6908; CHECK-NEXT:    nop
6909; CHECK-NEXT:    #NO_APP
6910; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6911; CHECK-NEXT:    retq
6912  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6913  %2 = sub <32 x i16> %a0, %a1
6914  ret <32 x i16> %2
6915}
6916
6917define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) {
6918; CHECK-LABEL: stack_fold_shufi64x2:
6919; CHECK:       # %bb.0:
6920; CHECK-NEXT:    pushq %rax
6921; CHECK-NEXT:    .cfi_def_cfa_offset 16
6922; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6923; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6924; CHECK-NEXT:    #APP
6925; CHECK-NEXT:    nop
6926; CHECK-NEXT:    #NO_APP
6927; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6928; CHECK-NEXT:    vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6929; CHECK-NEXT:    # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
6930; CHECK-NEXT:    popq %rax
6931; CHECK-NEXT:    .cfi_def_cfa_offset 8
6932; CHECK-NEXT:    retq
6933  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6934  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6935  ret <8 x i64> %2
6936}
6937
6938define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, ptr %passthru) {
6939; CHECK-LABEL: stack_fold_shufi64x2_mask:
6940; CHECK:       # %bb.0:
6941; CHECK-NEXT:    pushq %rax
6942; CHECK-NEXT:    .cfi_def_cfa_offset 16
6943; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6944; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6945; CHECK-NEXT:    #APP
6946; CHECK-NEXT:    nop
6947; CHECK-NEXT:    #NO_APP
6948; CHECK-NEXT:    kmovd %edi, %k1
6949; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm1
6950; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6951; CHECK-NEXT:    vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
6952; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
6953; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
6954; CHECK-NEXT:    popq %rax
6955; CHECK-NEXT:    .cfi_def_cfa_offset 8
6956; CHECK-NEXT:    retq
6957  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6958  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6959  %3 = bitcast i8 %mask to <8 x i1>
6960  ; load needed to keep the operation from being scheduled above the asm block
6961  %4 = load <8 x i64>, ptr %passthru
6962  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
6963  ret <8 x i64> %5
6964}
6965
6966define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, ptr %passthru) {
6967; CHECK-LABEL: stack_fold_shufi64x2_maskz:
6968; CHECK:       # %bb.0:
6969; CHECK-NEXT:    pushq %rax
6970; CHECK-NEXT:    .cfi_def_cfa_offset 16
6971; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6972; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6973; CHECK-NEXT:    #APP
6974; CHECK-NEXT:    nop
6975; CHECK-NEXT:    #NO_APP
6976; CHECK-NEXT:    kmovd %edi, %k1
6977; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6978; CHECK-NEXT:    vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6979; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
6980; CHECK-NEXT:    popq %rax
6981; CHECK-NEXT:    .cfi_def_cfa_offset 8
6982; CHECK-NEXT:    retq
6983  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6984  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6985  %3 = bitcast i8 %mask to <8 x i1>
6986  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
6987  ret <8 x i64> %4
6988}
6989
6990define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, ptr %passthru) {
6991; CHECK-LABEL: stack_fold_shufi32x4_mask:
6992; CHECK:       # %bb.0:
6993; CHECK-NEXT:    pushq %rax
6994; CHECK-NEXT:    .cfi_def_cfa_offset 16
6995; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6996; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6997; CHECK-NEXT:    #APP
6998; CHECK-NEXT:    nop
6999; CHECK-NEXT:    #NO_APP
7000; CHECK-NEXT:    kmovd %edi, %k1
7001; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm1
7002; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7003; CHECK-NEXT:    vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
7004; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
7005; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
7006; CHECK-NEXT:    popq %rax
7007; CHECK-NEXT:    .cfi_def_cfa_offset 8
7008; CHECK-NEXT:    retq
7009  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7010  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
7011  %3 = bitcast i16 %mask to <16 x i1>
7012  ; load needed to keep the operation from being scheduled above the asm block
7013  %4 = load <16 x i32>, ptr %passthru
7014  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7015  ret <16 x i32> %5
7016}
7017
7018define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
7019; CHECK-LABEL: stack_fold_shufi32x4_maskz:
7020; CHECK:       # %bb.0:
7021; CHECK-NEXT:    pushq %rax
7022; CHECK-NEXT:    .cfi_def_cfa_offset 16
7023; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7024; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7025; CHECK-NEXT:    #APP
7026; CHECK-NEXT:    nop
7027; CHECK-NEXT:    #NO_APP
7028; CHECK-NEXT:    kmovd %edi, %k1
7029; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7030; CHECK-NEXT:    vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7031; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
7032; CHECK-NEXT:    popq %rax
7033; CHECK-NEXT:    .cfi_def_cfa_offset 8
7034; CHECK-NEXT:    retq
7035  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7036  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
7037  %3 = bitcast i16 %mask to <16 x i1>
7038  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7039  ret <16 x i32> %4
7040}
7041
7042define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
7043; CHECK-LABEL: stack_fold_ternlogd:
7044; CHECK:       # %bb.0:
7045; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7046; CHECK-NEXT:    #APP
7047; CHECK-NEXT:    nop
7048; CHECK-NEXT:    #NO_APP
7049; CHECK-NEXT:    vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7050; CHECK-NEXT:    # zmm0 = ~(zmm1 | (zmm0 ^ mem))
7051; CHECK-NEXT:    retq
7052  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7053  %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
7054  ret <16 x i32> %2
7055}
7056declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
7057
7058define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
7059; CHECK-LABEL: stack_fold_ternlogq:
7060; CHECK:       # %bb.0:
7061; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7062; CHECK-NEXT:    #APP
7063; CHECK-NEXT:    nop
7064; CHECK-NEXT:    #NO_APP
7065; CHECK-NEXT:    vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7066; CHECK-NEXT:    # zmm0 = ~(zmm1 | (zmm0 ^ mem))
7067; CHECK-NEXT:    retq
7068  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7069  %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
7070  ret <8 x i64> %2
7071}
7072
7073declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
7074
7075define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
7076; CHECK-LABEL: stack_fold_punpckhbw_zmm:
7077; CHECK:       # %bb.0:
7078; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7079; CHECK-NEXT:    #APP
7080; CHECK-NEXT:    nop
7081; CHECK-NEXT:    #NO_APP
7082; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7083; CHECK-NEXT:    # zmm0 = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7084; CHECK-NEXT:    retq
7085  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7086  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7087  ret <64 x i8> %2
7088}
7089
7090define <64 x i8> @stack_fold_punpckhbw_mask_zmm(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7091; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm:
7092; CHECK:       # %bb.0:
7093; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7094; CHECK-NEXT:    #APP
7095; CHECK-NEXT:    nop
7096; CHECK-NEXT:    #NO_APP
7097; CHECK-NEXT:    kmovq %rsi, %k1
7098; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
7099; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
7100; CHECK-NEXT:    # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7101; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
7102; CHECK-NEXT:    retq
7103  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7104  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7105  %3 = bitcast i64 %mask to <64 x i1>
7106  ; load needed to keep the operation from being scheduled about the asm block
7107  %4 = load <64 x i8>, ptr %passthru
7108  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
7109  ret <64 x i8> %5
7110}
7111
7112define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7113; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm:
7114; CHECK:       # %bb.0:
7115; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7116; CHECK-NEXT:    #APP
7117; CHECK-NEXT:    nop
7118; CHECK-NEXT:    #NO_APP
7119; CHECK-NEXT:    kmovq %rdi, %k1
7120; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7121; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7122; CHECK-NEXT:    retq
7123  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7124  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7125  %3 = bitcast i64 %mask to <64 x i1>
7126  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
7127  ret <64 x i8> %4
7128}
7129
7130define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) {
7131; CHECK-LABEL: stack_fold_pxord:
7132; CHECK:       # %bb.0:
7133; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7134; CHECK-NEXT:    #APP
7135; CHECK-NEXT:    nop
7136; CHECK-NEXT:    #NO_APP
7137; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7138; CHECK-NEXT:    retq
7139  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7140  %2 = xor <16 x i32> %a0, %a1
7141  ret <16 x i32> %2
7142}
7143
7144define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
7145; CHECK-LABEL: stack_fold_pxord_commuted:
7146; CHECK:       # %bb.0:
7147; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7148; CHECK-NEXT:    #APP
7149; CHECK-NEXT:    nop
7150; CHECK-NEXT:    #NO_APP
7151; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7152; CHECK-NEXT:    retq
7153  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7154  %2 = xor <16 x i32> %a1, %a0
7155  ret <16 x i32> %2
7156}
7157
7158define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
7159; CHECK-LABEL: stack_fold_pxord_mask:
7160; CHECK:       # %bb.0:
7161; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7162; CHECK-NEXT:    vmovaps %zmm0, %zmm1
7163; CHECK-NEXT:    #APP
7164; CHECK-NEXT:    nop
7165; CHECK-NEXT:    #NO_APP
7166; CHECK-NEXT:    kmovd %esi, %k1
7167; CHECK-NEXT:    vmovaps (%rdi), %zmm0
7168; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7169; CHECK-NEXT:    retq
7170  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7171  %2 = xor <16 x i32> %a0, %a1
7172  %3 = bitcast i16 %mask to <16 x i1>
7173  ; load needed to keep the operation from being scheduled about the asm block
7174  %4 = load <16 x i32>, ptr %a2
7175  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7176  ret <16 x i32> %5
7177}
7178
7179define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
7180; CHECK-LABEL: stack_fold_pxord_mask_commuted:
7181; CHECK:       # %bb.0:
7182; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7183; CHECK-NEXT:    vmovaps %zmm0, %zmm1
7184; CHECK-NEXT:    #APP
7185; CHECK-NEXT:    nop
7186; CHECK-NEXT:    #NO_APP
7187; CHECK-NEXT:    kmovd %esi, %k1
7188; CHECK-NEXT:    vmovaps (%rdi), %zmm0
7189; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7190; CHECK-NEXT:    retq
7191  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7192  %2 = xor <16 x i32> %a1, %a0
7193  %3 = bitcast i16 %mask to <16 x i1>
7194  ; load needed to keep the operation from being scheduled about the asm block
7195  %4 = load <16 x i32>, ptr %a2
7196  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7197  ret <16 x i32> %5
7198}
7199
7200define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7201; CHECK-LABEL: stack_fold_pxord_maskz:
7202; CHECK:       # %bb.0:
7203; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7204; CHECK-NEXT:    #APP
7205; CHECK-NEXT:    nop
7206; CHECK-NEXT:    #NO_APP
7207; CHECK-NEXT:    kmovd %edi, %k1
7208; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7209; CHECK-NEXT:    retq
7210  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7211  %2 = xor <16 x i32> %a0, %a1
7212  %3 = bitcast i16 %mask to <16 x i1>
7213  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7214  ret <16 x i32> %4
7215}
7216
7217define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7218; CHECK-LABEL: stack_fold_pxord_maskz_commuted:
7219; CHECK:       # %bb.0:
7220; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7221; CHECK-NEXT:    #APP
7222; CHECK-NEXT:    nop
7223; CHECK-NEXT:    #NO_APP
7224; CHECK-NEXT:    kmovd %edi, %k1
7225; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7226; CHECK-NEXT:    retq
7227  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7228  %2 = xor <16 x i32> %a1, %a0
7229  %3 = bitcast i16 %mask to <16 x i1>
7230  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7231  ret <16 x i32> %4
7232}
7233
7234define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) {
7235; CHECK-LABEL: stack_fold_pxorq:
7236; CHECK:       # %bb.0:
7237; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7238; CHECK-NEXT:    #APP
7239; CHECK-NEXT:    nop
7240; CHECK-NEXT:    #NO_APP
7241; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7242; CHECK-NEXT:    retq
7243  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7244  %2 = xor <8 x i64> %a0, %a1
7245  ret <8 x i64> %2
7246}
7247
7248define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
7249; CHECK-LABEL: stack_fold_pxorq_commuted:
7250; CHECK:       # %bb.0:
7251; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7252; CHECK-NEXT:    #APP
7253; CHECK-NEXT:    nop
7254; CHECK-NEXT:    #NO_APP
7255; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7256; CHECK-NEXT:    retq
7257  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7258  %2 = xor <8 x i64> %a1, %a0
7259  ret <8 x i64> %2
7260}
7261
7262define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
7263; CHECK-LABEL: stack_fold_pxorq_mask:
7264; CHECK:       # %bb.0:
7265; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7266; CHECK-NEXT:    vmovapd %zmm0, %zmm1
7267; CHECK-NEXT:    #APP
7268; CHECK-NEXT:    nop
7269; CHECK-NEXT:    #NO_APP
7270; CHECK-NEXT:    kmovd %esi, %k1
7271; CHECK-NEXT:    vmovapd (%rdi), %zmm0
7272; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7273; CHECK-NEXT:    retq
7274  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7275  %2 = xor <8 x i64> %a0, %a1
7276  %3 = bitcast i8 %mask to <8 x i1>
7277  ; load needed to keep the operation from being scheduled about the asm block
7278  %4 = load <8 x i64>, ptr %a2
7279  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7280  ret <8 x i64> %5
7281}
7282
7283define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
7284; CHECK-LABEL: stack_fold_pxorq_mask_commuted:
7285; CHECK:       # %bb.0:
7286; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7287; CHECK-NEXT:    vmovapd %zmm0, %zmm1
7288; CHECK-NEXT:    #APP
7289; CHECK-NEXT:    nop
7290; CHECK-NEXT:    #NO_APP
7291; CHECK-NEXT:    kmovd %esi, %k1
7292; CHECK-NEXT:    vmovapd (%rdi), %zmm0
7293; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7294; CHECK-NEXT:    retq
7295  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7296  %2 = xor <8 x i64> %a1, %a0
7297  %3 = bitcast i8 %mask to <8 x i1>
7298  ; load needed to keep the operation from being scheduled about the asm block
7299  %4 = load <8 x i64>, ptr %a2
7300  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7301  ret <8 x i64> %5
7302}
7303
7304define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7305; CHECK-LABEL: stack_fold_pxorq_maskz:
7306; CHECK:       # %bb.0:
7307; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7308; CHECK-NEXT:    #APP
7309; CHECK-NEXT:    nop
7310; CHECK-NEXT:    #NO_APP
7311; CHECK-NEXT:    kmovd %edi, %k1
7312; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7313; CHECK-NEXT:    retq
7314  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7315  %2 = xor <8 x i64> %a0, %a1
7316  %3 = bitcast i8 %mask to <8 x i1>
7317  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7318  ret <8 x i64> %4
7319}
7320
7321define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7322; CHECK-LABEL: stack_fold_pxorq_maskz_commuted:
7323; CHECK:       # %bb.0:
7324; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7325; CHECK-NEXT:    #APP
7326; CHECK-NEXT:    nop
7327; CHECK-NEXT:    #NO_APP
7328; CHECK-NEXT:    kmovd %edi, %k1
7329; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7330; CHECK-NEXT:    retq
7331  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7332  %2 = xor <8 x i64> %a1, %a0
7333  %3 = bitcast i8 %mask to <8 x i1>
7334  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7335  ret <8 x i64> %4
7336}
7337
7338declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
7339declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)
7340declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
7341declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>)
7342declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>)
7343declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>)
7344declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
7345declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1)
7346declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
7347declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)
7348declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
7349declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)
7350