; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl < %s | FileCheck %s
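
; Stack-folding tests: each function spills a 512-bit operand across an inline
; asm "nop" that clobbers xmm3-xmm31 (and xmm2 in the merge-masking tests),
; then checks that the 64-byte reload is folded directly into the VPMADD52
; memory operand rather than emitted as a separate load.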

declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
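; Both intrinsics multiply the unsigned low 52 bits of their second and third
; operands; vpmadd52h.uq adds the high 52 bits of the 104-bit product to the
; 64-bit accumulator elements, vpmadd52l.uq adds the low 52 bits.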

define <8 x i64> @stack_fold_vpmadd52huq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52huq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  ret <8 x i64> %2
}

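; The _commuted tests pass the two multiplicand operands in swapped order; the
; multiply is commutative, so the same folded reload is expected.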
define <8 x i64> @stack_fold_vpmadd52huq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52huq_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  ret <8 x i64> %2
}

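; The _mask tests load the accumulator from memory and merge under %k1: lanes
; whose mask bit is clear keep the loaded value.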
define <8 x i64> @stack_fold_vpmadd52huq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_vpmadd52huq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}

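; The _maskz tests use zero-masking ({%k1} {z}): lanes whose mask bit is clear
; are zeroed instead of merged.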
define <8 x i64> @stack_fold_vpmadd52huq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_vpmadd52huq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}

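; The tests below repeat the same unmasked, merge-masked, and zero-masked
; patterns for the low-half intrinsic, @llvm.x86.avx512.vpmadd52l.uq.512.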
define <8 x i64> @stack_fold_vpmadd52luq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52luq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  ret <8 x i64> %2
}

define <8 x i64> @stack_fold_vpmadd52luq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52luq_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  ret <8 x i64> %2
}

define <8 x i64> @stack_fold_vpmadd52luq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_vpmadd52luq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_vpmadd52luq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_vpmadd52luq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}