; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
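;
; VF[C]MULC{PH,SH} write an output-only destination register, which can leave
; a false dependency on that register's previous value. The ENABLE runs check
; that +false-deps-mulc breaks the dependency by zeroing the destination
; (vpxor/vxorps) before the multiply; the DISABLE runs check that no zeroing
; idiom is emitted when the feature is off.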

define <16 x float> @fmulcph(<16 x float> %a0, <16 x float> %a1) {
; ENABLE-LABEL: fmulcph:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %zmm1, %zmm0, %zmm2
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulcph:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fmulcph_mem(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulcph_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi), %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulcph_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi), %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <16 x float>, ptr %p1, align 64
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fmulcph_broadcast(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulcph_broadcast:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi){1to16}, %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulcph_broadcast:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi){1to16}, %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <16 x float> undef, float %v1, i64 0
  %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulcph_maskz:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovw (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulcph_maskz:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovw (%rdi), %k1
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
  ret <16 x float> %3
}

define <16 x float> @fcmulcph(<16 x float> %a0, <16 x float> %a1) {
; ENABLE-LABEL: fcmulcph:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %zmm1, %zmm0, %zmm2
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulcph:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fcmulcph_mem(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulcph_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi), %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulcph_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi), %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <16 x float>, ptr %p1, align 64
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fcmulcph_broadcast(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulcph_broadcast:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi){1to16}, %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulcph_broadcast:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi){1to16}, %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <16 x float> undef, float %v1, i64 0
  %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulcph_maskz:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovw (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulcph_maskz:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovw (%rdi), %k1
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
  ret <16 x float> %3
}

define <4 x float> @fmulc(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fmulc:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fmulc_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fmulc_broadcast(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_broadcast:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi){1to4}, %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc_broadcast:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi){1to4}, %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <4 x float> undef, float %v1, i64 0
  %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulc_maskz:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc_maskz:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}

define <4 x float> @fcmulc(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fcmulc:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fcmulc_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fcmulc_broadcast(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_broadcast:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi){1to4}, %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc_broadcast:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi){1to4}, %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <4 x float> undef, float %v1, i64 0
  %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulc_maskz:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc_maskz:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}

define <8 x float> @fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; ENABLE-LABEL: fmulc_ymm:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %ymm1, %ymm0, %ymm2
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc_ymm:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fmulc_ymm_mem(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_ymm_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi), %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc_ymm_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph (%rdi), %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <8 x float>, ptr %p1, align 64
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fmulc_ymm_broadcast(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_ymm_broadcast:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi){1to8}, %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc_ymm_broadcast:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph (%rdi){1to8}, %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <8 x float> undef, float %v1, i64 0
  %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulc_maskz_ymm:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulc_maskz_ymm:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

define <8 x float> @fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; ENABLE-LABEL: fcmulc_ymm:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %ymm1, %ymm0, %ymm2
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc_ymm:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fcmulc_ymm_mem(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_ymm_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi), %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc_ymm_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph (%rdi), %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <8 x float>, ptr %p1, align 64
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fcmulc_ymm_broadcast(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_ymm_broadcast:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi){1to8}, %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc_ymm_broadcast:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph (%rdi){1to8}, %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <8 x float> undef, float %v1, i64 0
  %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulc_maskz_ymm:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulc_maskz_ymm:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

define <4 x float> @fmulcsh(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fmulcsh:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcsh %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulcsh:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fmulcsh_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulcsh_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcsh (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulcsh_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfmulcsh (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulcsh_maskz:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fmulcsh_maskz:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
  ret <4 x float> %3
}

define <4 x float> @fcmulcsh(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fcmulcsh:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcsh %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulcsh:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fcmulcsh_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulcsh_mem:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcsh (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulcsh_mem:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfcmulcsh (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulcsh_maskz:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: fcmulcsh_maskz:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
  ret <4 x float> %3
}

declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)