; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 < %s | FileCheck %s
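; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a (partial) register
; spill of the relevant registers and check that the reload is correctly
; folded into the instruction.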
declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
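; VPDPBSSD/VPDPBSSDS compute dot products of signed bytes with signed bytes,
; accumulating into signed dwords (with saturation for the 'S' forms). Both
; multiplicands are signed, so the sources can be commuted and the commuted
; tests below still fold the reload.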
define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}
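; VPDPBSUD/VPDPBSUDS multiply signed bytes (first source) with unsigned bytes
; (second source). The mixed signedness means the sources cannot be commuted,
; so the commuted tests below reload the spilled value into a register instead
; of folding it into the instruction.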
define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT:    vpdpbsud %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT:    vpdpbsud %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT:    vpdpbsuds %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT:    vpdpbsuds %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}
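; VPDPBUUD/VPDPBUUDS multiply unsigned bytes with unsigned bytes. As with the
; signed/signed forms, the sources commute, so the commuted tests below still
; fold the reload.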
define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}