; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
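;
; VPMULLQ is write-only to its destination, but on some Intel CPUs (this test
; uses -mcpu=sapphirerapids) it carries a false dependency on the previous
; value of the destination register. With +false-deps-mullq the backend breaks
; that dependency by zeroing the destination (vxorps/vpxor) whenever it would
; otherwise be read as undef; with -false-deps-mullq no xor is inserted. The
; inline asm in each test clobbers xmm2-xmm31 to force spills/reloads and make
; the undef destination read visible to the dependency-breaking logic.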

define <2 x i64> @pmullq_128(<2 x i64> %a0, <2 x i64> %a1) {
; ENABLE-LABEL: pmullq_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq %xmm2, %xmm0, %xmm1
; ENABLE-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; ENABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; DISABLE-NEXT:    vpmullq %xmm2, %xmm0, %xmm1
; DISABLE-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; DISABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %3 = add <2 x i64> %a0, %a1
  %res = add <2 x i64> %2, %3
  ret <2 x i64> %res
}

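; The folded-load and broadcast forms below still read an undef destination,
; so the ENABLE runs get the same dependency-breaking xor as the register
; form, while the DISABLE runs do not.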
define <2 x i64> @pmullq_mem_128(<2 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_mem_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <2 x i64>, ptr %p1, align 64
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %res = add <2 x i64> %2, %a0
  ret <2 x i64> %res
}

define <2 x i64> @pmullq_broadcast_128(<2 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi){1to2}, %xmm0, %xmm1
; ENABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_broadcast_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi){1to2}, %xmm0, %xmm1
; DISABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <2 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <2 x i64> %t0, <2 x i64> undef, <2 x i32> zeroinitializer
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %res = add <2 x i64> %2, %a0
  ret <2 x i64> %res
}

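; In the maskz tests every operand of VPMULLQ is a defined value and the
; multiply is scheduled ahead of the clobbering asm, so there is no undef
; read to break: ENABLE and DISABLE emit identical code. The same holds for
; the 256- and 512-bit maskz variants below.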
define <2 x i64> @pmullq_maskz_128(<2 x i64> %a0, <2 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vpmullq %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; ENABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_maskz_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vpmullq %xmm1, %xmm0, %xmm2
; DISABLE-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; DISABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> zeroinitializer, i8 %mask)
  %3 = add <2 x i64> %a0, %a1
  %res = add <2 x i64> %2, %3
  ret <2 x i64> %res
}

declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)

define <4 x i64> @pmullq_256(<4 x i64> %a0, <4 x i64> %a1) {
; ENABLE-LABEL: pmullq_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq %ymm2, %ymm0, %ymm1
; ENABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; ENABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; DISABLE-NEXT:    vpmullq %ymm2, %ymm0, %ymm1
; DISABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; DISABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %3 = add <4 x i64> %a0, %a1
  %res = add <4 x i64> %2, %3
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_mem_256(<4 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi), %ymm0, %ymm1
; ENABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_mem_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi), %ymm0, %ymm1
; DISABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x i64>, ptr %p1, align 64
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_broadcast_256(<4 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi){1to4}, %ymm0, %ymm1
; ENABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_broadcast_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi){1to4}, %ymm0, %ymm1
; DISABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <4 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_maskz_256(<4 x i64> %a0, <4 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vpmullq %ymm1, %ymm0, %ymm2
; ENABLE-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; ENABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_maskz_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vpmullq %ymm1, %ymm0, %ymm2
; DISABLE-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; DISABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> zeroinitializer, i8 %mask)
  %3 = add <4 x i64> %a0, %a1
  %res = add <4 x i64> %2, %3
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)

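; The 512-bit tests follow the same pattern as the 128/256-bit ones, except
; that the dependency-breaking zeroing idiom is vpxor rather than vxorps.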
define <8 x i64> @pmullq_512(<8 x i64> %a0, <8 x i64> %a1) {
; ENABLE-LABEL: pmullq_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq %zmm2, %zmm0, %zmm1
; ENABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; ENABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; DISABLE-NEXT:    vpmullq %zmm2, %zmm0, %zmm1
; DISABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; DISABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %3 = add <8 x i64> %a0, %a1
  %res = add <8 x i64> %2, %3
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_mem_512(<8 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi), %zmm0, %zmm1
; ENABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_mem_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi), %zmm0, %zmm1
; DISABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <8 x i64>, ptr %p1, align 64
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %res = add <8 x i64> %2, %a0
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_broadcast_512(<8 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi){1to8}, %zmm0, %zmm1
; ENABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_broadcast_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi){1to8}, %zmm0, %zmm1
; DISABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <8 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %res = add <8 x i64> %2, %a0
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_maskz_512(<8 x i64> %a0, <8 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vpmullq %zmm1, %zmm0, %zmm2
; ENABLE-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; ENABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_maskz_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vpmullq %zmm1, %zmm0, %zmm2
; DISABLE-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; DISABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  %3 = add <8 x i64> %a0, %a1
  %res = add <8 x i64> %2, %3
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)