xref: /llvm-project/llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll (revision 2f448bf509432c1a19ec46ab8cbc7353c03c6280)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s
3
; Fused multiply-add intrinsics exercised by this test: one scalar half form
; and the 8-, 16- and 32-element half vector forms.
declare half @llvm.fma.f16(half, half, half)
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
8
; Scalar half FMA computing %x * %y + %z, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define half @fma_123_f16(half %x, half %y, half %z) {
; CHECK-LABEL: fma_123_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213sh %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a = call half @llvm.fma.f16(half %x, half %y, half %z)
  ret half %a
}
17
; Scalar half FMA computing %y * %x + %z, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define half @fma_213_f16(half %x, half %y, half %z) {
; CHECK-LABEL: fma_213_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213sh %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a = call half @llvm.fma.f16(half %y, half %x, half %z)
  ret half %a
}
26
; Scalar half FMA computing %y * %z + %x, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define half @fma_231_f16(half %x, half %y, half %z) {
; CHECK-LABEL: fma_231_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231sh %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call half @llvm.fma.f16(half %y, half %z, half %x)
  ret half %a
}
35
; Scalar half FMA computing %z * %y + %x, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define half @fma_321_f16(half %x, half %y, half %z) {
; CHECK-LABEL: fma_321_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231sh %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call half @llvm.fma.f16(half %z, half %y, half %x)
  ret half %a
}
44
; Scalar half FMA computing %x * %z + %y, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define half @fma_132_f16(half %x, half %y, half %z) {
; CHECK-LABEL: fma_132_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213sh %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call half @llvm.fma.f16(half %x, half %z, half %y)
  ret half %a
}
53
; Scalar half FMA computing %z * %x + %y, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define half @fma_312_f16(half %x, half %y, half %z) {
; CHECK-LABEL: fma_312_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213sh %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call half @llvm.fma.f16(half %z, half %x, half %y)
  ret half %a
}
62
; Scalar half FMA computing %x * %y + %z, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define half @fma_load_123_f16(half %x, half %y, ptr %zp) {
; CHECK-LABEL: fma_load_123_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213sh (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load half, ptr %zp
  %a = call half @llvm.fma.f16(half %x, half %y, half %z)
  ret half %a
}
72
; Scalar half FMA computing %y * %x + %z, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define half @fma_load_213_f16(half %x, half %y, ptr %zp) {
; CHECK-LABEL: fma_load_213_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213sh (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load half, ptr %zp
  %a = call half @llvm.fma.f16(half %y, half %x, half %z)
  ret half %a
}
82
; Scalar half FMA computing %y * %z + %x, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define half @fma_load_231_f16(half %x, half %y, ptr %zp) {
; CHECK-LABEL: fma_load_231_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231sh (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load half, ptr %zp
  %a = call half @llvm.fma.f16(half %y, half %z, half %x)
  ret half %a
}
92
; Scalar half FMA computing %z * %y + %x, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define half @fma_load_321_f16(half %x, half %y, ptr %zp) {
; CHECK-LABEL: fma_load_321_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231sh (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load half, ptr %zp
  %a = call half @llvm.fma.f16(half %z, half %y, half %x)
  ret half %a
}
102
; Scalar half FMA computing %x * %z + %y, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define half @fma_load_132_f16(half %x, half %y, ptr %zp) {
; CHECK-LABEL: fma_load_132_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132sh (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load half, ptr %zp
  %a = call half @llvm.fma.f16(half %x, half %z, half %y)
  ret half %a
}
112
; Scalar half FMA computing %z * %x + %y, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define half @fma_load_312_f16(half %x, half %y, ptr %zp) {
; CHECK-LABEL: fma_load_312_f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132sh (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load half, ptr %zp
  %a = call half @llvm.fma.f16(half %z, half %x, half %y)
  ret half %a
}
122
; <8 x half> FMA computing %x * %y + %z, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <8 x half> @fma_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
; CHECK-LABEL: fma_123_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
  ret <8 x half> %a
}
131
; <8 x half> FMA computing %y * %x + %z, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <8 x half> @fma_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
; CHECK-LABEL: fma_213_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
  ret <8 x half> %a
}
140
; <8 x half> FMA computing %y * %z + %x, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <8 x half> @fma_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
; CHECK-LABEL: fma_231_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
  ret <8 x half> %a
}
149
; <8 x half> FMA computing %z * %y + %x, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <8 x half> @fma_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
; CHECK-LABEL: fma_321_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
  ret <8 x half> %a
}
158
; <8 x half> FMA computing %x * %z + %y, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <8 x half> @fma_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
; CHECK-LABEL: fma_132_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
  ret <8 x half> %a
}
167
; <8 x half> FMA computing %z * %x + %y, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <8 x half> @fma_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
; CHECK-LABEL: fma_312_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
  ret <8 x half> %a
}
176
; <8 x half> FMA computing %x * %y + %z, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <8 x half> @fma_load_123_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_123_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
  ret <8 x half> %a
}
186
; <8 x half> FMA computing %y * %x + %z, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <8 x half> @fma_load_213_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_213_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
  ret <8 x half> %a
}
196
; <8 x half> FMA computing %y * %z + %x, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <8 x half> @fma_load_231_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_231_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
  ret <8 x half> %a
}
206
; <8 x half> FMA computing %z * %y + %x, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <8 x half> @fma_load_321_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_321_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
  ret <8 x half> %a
}
216
; <8 x half> FMA computing %x * %z + %y, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <8 x half> @fma_load_132_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_132_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132ph (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
  ret <8 x half> %a
}
226
; <8 x half> FMA computing %z * %x + %y, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <8 x half> @fma_load_312_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_312_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132ph (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
  ret <8 x half> %a
}
236
; <8 x half> FMA computing %x * %y + %z; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <8 x half> @fma_mask_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_mask_123_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd132ph %xmm1, %xmm2, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
248
; <8 x half> FMA computing %y * %x + %z; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <8 x half> @fma_mask_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_mask_213_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
260
; <8 x half> FMA computing %y * %z + %x; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <8 x half> @fma_mask_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_mask_231_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
272
; <8 x half> FMA computing %z * %y + %x; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <8 x half> @fma_mask_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_mask_321_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
284
; <8 x half> FMA computing %x * %z + %y; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <8 x half> @fma_mask_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_mask_132_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd132ph %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
296
; <8 x half> FMA computing %z * %x + %y; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <8 x half> @fma_mask_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_mask_312_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
308
; <8 x half> FMA computing %x * %y + %z; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <8 x half> @fma_maskz_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_maskz_123_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
320
; <8 x half> FMA computing %y * %x + %z; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <8 x half> @fma_maskz_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_maskz_213_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
332
; <8 x half> FMA computing %y * %z + %x; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <8 x half> @fma_maskz_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_maskz_231_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
344
; <8 x half> FMA computing %z * %y + %x; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <8 x half> @fma_maskz_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_maskz_321_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
356
; <8 x half> FMA computing %x * %z + %y; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <8 x half> @fma_maskz_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_maskz_132_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
368
; <8 x half> FMA computing %z * %x + %y; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <8 x half> @fma_maskz_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) {
; CHECK-LABEL: fma_maskz_312_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
380
; <8 x half> FMA computing %x * %y + %z with %z loaded from %zp; lanes with a
; clear %mask bit take %x. CHECK requires kmovd into %k1 and one {%k1}-predicated
; FMA whose memory operand is (%rdi).
define <8 x half> @fma_mask_load_123_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_mask_load_123_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
393
; <8 x half> FMA computing %y * %x + %z with %z loaded from %zp; lanes with a
; clear %mask bit take %x. CHECK requires kmovd into %k1 and one {%k1}-predicated
; FMA whose memory operand is (%rdi).
define <8 x half> @fma_mask_load_213_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_mask_load_213_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
406
; <8 x half> FMA computing %y * %z + %x with %z loaded from %zp; lanes with a
; clear %mask bit take %x. CHECK requires kmovd into %k1 and one {%k1}-predicated
; FMA whose memory operand is (%rdi).
define <8 x half> @fma_mask_load_231_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_mask_load_231_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
419
; <8 x half> FMA computing %z * %y + %x with %z loaded from %zp; lanes with a
; clear %mask bit take %x. CHECK requires kmovd into %k1 and one {%k1}-predicated
; FMA whose memory operand is (%rdi).
define <8 x half> @fma_mask_load_321_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_mask_load_321_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
432
; <8 x half> FMA computing %x * %z + %y with %z loaded from %zp; lanes with a
; clear %mask bit take %x. CHECK requires kmovd into %k1 and one {%k1}-predicated
; FMA whose memory operand is (%rdi).
define <8 x half> @fma_mask_load_132_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_mask_load_132_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
445
; <8 x half> FMA computing %z * %x + %y with %z loaded from %zp; lanes with a
; clear %mask bit take %x. CHECK requires kmovd into %k1 and one {%k1}-predicated
; FMA whose memory operand is (%rdi).
define <8 x half> @fma_mask_load_312_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_mask_load_312_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x
  ret <8 x half> %c
}
458
; <8 x half> FMA computing %x * %y + %z with %z loaded from %zp; lanes with a
; clear %mask bit become zero. CHECK requires kmovd into %k1 and one FMA using
; {%k1} {z} whose memory operand is (%rdi).
define <8 x half> @fma_maskz_load_123_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_maskz_load_123_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
471
; <8 x half> FMA computing %y * %x + %z with %z loaded from %zp; lanes with a
; clear %mask bit become zero. CHECK requires kmovd into %k1 and one FMA using
; {%k1} {z} whose memory operand is (%rdi).
define <8 x half> @fma_maskz_load_213_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_maskz_load_213_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
484
; <8 x half> FMA computing %y * %z + %x with %z loaded from %zp; lanes with a
; clear %mask bit become zero. CHECK requires kmovd into %k1 and one FMA using
; {%k1} {z} whose memory operand is (%rdi).
define <8 x half> @fma_maskz_load_231_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_maskz_load_231_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
497
; <8 x half> FMA computing %z * %y + %x with %z loaded from %zp; lanes with a
; clear %mask bit become zero. CHECK requires kmovd into %k1 and one FMA using
; {%k1} {z} whose memory operand is (%rdi).
define <8 x half> @fma_maskz_load_321_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_maskz_load_321_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
510
; <8 x half> FMA computing %x * %z + %y with %z loaded from %zp; lanes with a
; clear %mask bit become zero. CHECK requires kmovd into %k1 and one FMA using
; {%k1} {z} whose memory operand is (%rdi).
define <8 x half> @fma_maskz_load_132_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_maskz_load_132_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
523
; <8 x half> FMA computing %z * %x + %y with %z loaded from %zp; lanes with a
; clear %mask bit become zero. CHECK requires kmovd into %k1 and one FMA using
; {%k1} {z} whose memory operand is (%rdi).
define <8 x half> @fma_maskz_load_312_v8f16(<8 x half> %x, <8 x half> %y, ptr %zp, i8 %mask) {
; CHECK-LABEL: fma_maskz_load_312_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <8 x half>, ptr %zp
  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
  %b = bitcast i8 %mask to <8 x i1>
  %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer
  ret <8 x half> %c
}
536
; <16 x half> FMA computing %x * %y + %z, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <16 x half> @fma_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
; CHECK-LABEL: fma_123_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
  ret <16 x half> %a
}
545
; <16 x half> FMA computing %y * %x + %z, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <16 x half> @fma_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
; CHECK-LABEL: fma_213_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
  ret <16 x half> %a
}
554
; <16 x half> FMA computing %y * %z + %x, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <16 x half> @fma_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
; CHECK-LABEL: fma_231_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
  ret <16 x half> %a
}
563
; <16 x half> FMA computing %z * %y + %x, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <16 x half> @fma_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
; CHECK-LABEL: fma_321_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
  ret <16 x half> %a
}
572
; <16 x half> FMA computing %x * %z + %y, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <16 x half> @fma_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
; CHECK-LABEL: fma_132_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
  ret <16 x half> %a
}
581
; <16 x half> FMA computing %z * %x + %y, all operands in registers.
; CHECK requires exactly one FMA instruction followed by retq.
define <16 x half> @fma_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) {
; CHECK-LABEL: fma_312_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
  ret <16 x half> %a
}
590
; <16 x half> FMA computing %x * %y + %z, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <16 x half> @fma_load_123_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_123_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
  ret <16 x half> %a
}
600
; <16 x half> FMA computing %y * %x + %z, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <16 x half> @fma_load_213_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_213_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
  ret <16 x half> %a
}
610
; <16 x half> FMA computing %y * %z + %x, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <16 x half> @fma_load_231_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_231_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
  ret <16 x half> %a
}
620
; <16 x half> FMA computing %z * %y + %x, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <16 x half> @fma_load_321_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_321_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
  ret <16 x half> %a
}
630
; <16 x half> FMA computing %x * %z + %y, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <16 x half> @fma_load_132_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_132_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132ph (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
  ret <16 x half> %a
}
640
; <16 x half> FMA computing %z * %x + %y, where %z is loaded from %zp.
; CHECK requires the load to be folded into the FMA as its (%rdi) operand.
define <16 x half> @fma_load_312_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_312_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132ph (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
  ret <16 x half> %a
}
650
; <16 x half> FMA computing %x * %y + %z; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <16 x half> @fma_mask_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_mask_123_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd132ph %ymm1, %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
662
; <16 x half> FMA computing %y * %x + %z; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <16 x half> @fma_mask_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_mask_213_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
674
; <16 x half> FMA computing %y * %z + %x; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <16 x half> @fma_mask_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_mask_231_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
686
; <16 x half> FMA computing %z * %y + %x; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <16 x half> @fma_mask_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_mask_321_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
698
; <16 x half> FMA computing %x * %z + %y; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <16 x half> @fma_mask_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_mask_132_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd132ph %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
710
; <16 x half> FMA computing %z * %x + %y; lanes with a clear %mask bit take %x.
; CHECK requires kmovd into %k1 followed by one FMA predicated on {%k1}.
define <16 x half> @fma_mask_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_mask_312_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
722
; <16 x half> FMA computing %x * %y + %z; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <16 x half> @fma_maskz_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_maskz_123_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
734
; <16 x half> FMA computing %y * %x + %z; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <16 x half> @fma_maskz_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_maskz_213_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
746
; <16 x half> FMA computing %y * %z + %x; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <16 x half> @fma_maskz_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_maskz_231_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
758
; <16 x half> FMA computing %z * %y + %x; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <16 x half> @fma_maskz_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_maskz_321_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
770
; <16 x half> FMA computing %x * %z + %y; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <16 x half> @fma_maskz_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_maskz_132_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
782
; <16 x half> FMA computing %z * %x + %y; lanes with a clear %mask bit become zero.
; CHECK requires kmovd into %k1 followed by one FMA using {%k1} {z}.
define <16 x half> @fma_maskz_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) {
; CHECK-LABEL: fma_maskz_312_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
794
; <16 x half> FMA computing %x * %y + %z with %z loaded from %zp; lanes with a
; clear %mask bit take %x. CHECK requires kmovd into %k1 and one {%k1}-predicated
; FMA whose memory operand is (%rdi).
define <16 x half> @fma_mask_load_123_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_mask_load_123_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
807
; Merge-masked 16 x half FMA computing %y * %x + %z, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd213ph with {%k1}.
define <16 x half> @fma_mask_load_213_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_mask_load_213_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
820
; Merge-masked 16 x half FMA computing %y * %z + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd231ph with {%k1}.
define <16 x half> @fma_mask_load_231_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_mask_load_231_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
833
; Merge-masked 16 x half FMA computing %z * %y + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd231ph with {%k1}.
define <16 x half> @fma_mask_load_321_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_mask_load_321_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
846
; Merge-masked 16 x half FMA computing %x * %z + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd132ph with {%k1}.
define <16 x half> @fma_mask_load_132_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_mask_load_132_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
859
; Merge-masked 16 x half FMA computing %z * %x + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd132ph with {%k1}.
define <16 x half> @fma_mask_load_312_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_mask_load_312_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x
  ret <16 x half> %c
}
872
; Zero-masked 16 x half FMA computing %x * %y + %z, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd213ph with {%k1} {z}.
define <16 x half> @fma_maskz_load_123_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_maskz_load_123_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
885
; Zero-masked 16 x half FMA computing %y * %x + %z, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd213ph with {%k1} {z}.
define <16 x half> @fma_maskz_load_213_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_maskz_load_213_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
898
; Zero-masked 16 x half FMA computing %y * %z + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd231ph with {%k1} {z}.
define <16 x half> @fma_maskz_load_231_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_maskz_load_231_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
911
; Zero-masked 16 x half FMA computing %z * %y + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd231ph with {%k1} {z}.
define <16 x half> @fma_maskz_load_321_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_maskz_load_321_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
924
; Zero-masked 16 x half FMA computing %x * %z + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd132ph with {%k1} {z}.
define <16 x half> @fma_maskz_load_132_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_maskz_load_132_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
937
; Zero-masked 16 x half FMA computing %z * %x + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd132ph with {%k1} {z}.
define <16 x half> @fma_maskz_load_312_v16f16(<16 x half> %x, <16 x half> %y, ptr %zp, i16 %mask) {
; CHECK-LABEL: fma_maskz_load_312_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <16 x half>, ptr %zp
  %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y)
  %b = bitcast i16 %mask to <16 x i1>
  %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer
  ret <16 x half> %c
}
950
; Unmasked 32 x half FMA computing %x * %y + %z. The checks expect a single
; register-form vfmadd213ph on zmm registers.
define <32 x half> @fma_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
; CHECK-LABEL: fma_123_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
  ret <32 x half> %a
}
959
; Unmasked 32 x half FMA computing %y * %x + %z (multiplicands commuted
; relative to the 123 case). The checks expect a single register-form
; vfmadd213ph on zmm registers.
define <32 x half> @fma_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
; CHECK-LABEL: fma_213_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
  ret <32 x half> %a
}
968
; Unmasked 32 x half FMA computing %y * %z + %x. The checks expect a single
; register-form vfmadd231ph on zmm registers.
define <32 x half> @fma_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
; CHECK-LABEL: fma_231_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
  ret <32 x half> %a
}
977
; Unmasked 32 x half FMA computing %z * %y + %x. The checks expect a single
; register-form vfmadd231ph on zmm registers.
define <32 x half> @fma_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
; CHECK-LABEL: fma_321_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
  ret <32 x half> %a
}
986
; Unmasked 32 x half FMA computing %x * %z + %y. The checks expect a single
; register-form vfmadd213ph with %zmm1 as the addend operand.
define <32 x half> @fma_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
; CHECK-LABEL: fma_132_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
  ret <32 x half> %a
}
995
; Unmasked 32 x half FMA computing %z * %x + %y. The checks expect a single
; register-form vfmadd213ph with %zmm1 as the addend operand.
define <32 x half> @fma_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) {
; CHECK-LABEL: fma_312_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
  ret <32 x half> %a
}
1004
; Unmasked 32 x half FMA computing %x * %y + %z, where %z is loaded from %zp.
; The checks expect the load folded into the (%rdi) operand of a single
; vfmadd213ph.
define <32 x half> @fma_load_123_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_123_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
  ret <32 x half> %a
}
1014
; Unmasked 32 x half FMA computing %y * %x + %z, where %z is loaded from %zp.
; The checks expect the load folded into the (%rdi) operand of a single
; vfmadd213ph.
define <32 x half> @fma_load_213_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_213_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd213ph (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
  ret <32 x half> %a
}
1024
; Unmasked 32 x half FMA computing %y * %z + %x, where %z is loaded from %zp.
; The checks expect the load folded into the (%rdi) operand of a single
; vfmadd231ph.
define <32 x half> @fma_load_231_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_231_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
  ret <32 x half> %a
}
1034
; Unmasked 32 x half FMA computing %z * %y + %x, where %z is loaded from %zp.
; The checks expect the load folded into the (%rdi) operand of a single
; vfmadd231ph.
define <32 x half> @fma_load_321_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_321_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd231ph (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
  ret <32 x half> %a
}
1044
; Unmasked 32 x half FMA computing %x * %z + %y, where %z is loaded from %zp.
; The checks expect the load folded into the (%rdi) operand of a single
; vfmadd132ph.
define <32 x half> @fma_load_132_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_132_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132ph (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
  ret <32 x half> %a
}
1054
; Unmasked 32 x half FMA computing %z * %x + %y, where %z is loaded from %zp.
; The checks expect the load folded into the (%rdi) operand of a single
; vfmadd132ph.
define <32 x half> @fma_load_312_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp) {
; CHECK-LABEL: fma_load_312_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vfmadd132ph (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
  ret <32 x half> %a
}
1064
; Merge-masked 32 x half FMA computing %x * %y + %z; lanes whose %mask bit is
; clear keep %x. %x is a multiplicand here, and the checks expect a single
; register-form vfmadd132ph with {%k1}.
define <32 x half> @fma_mask_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_mask_123_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1076
; Merge-masked 32 x half FMA computing %y * %x + %z; lanes whose %mask bit is
; clear keep %x. The checks expect a single register-form vfmadd213ph with
; {%k1}.
define <32 x half> @fma_mask_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_mask_213_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1088
; Merge-masked 32 x half FMA computing %y * %z + %x; lanes whose %mask bit is
; clear keep %x. %x is the addend here, and the checks expect a single
; register-form vfmadd231ph with {%k1}.
define <32 x half> @fma_mask_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_mask_231_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1100
; Merge-masked 32 x half FMA computing %z * %y + %x; lanes whose %mask bit is
; clear keep %x. %x is the addend here, and the checks expect a single
; register-form vfmadd231ph with {%k1}.
define <32 x half> @fma_mask_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_mask_321_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1112
; Merge-masked 32 x half FMA computing %x * %z + %y; lanes whose %mask bit is
; clear keep %x. %x is a multiplicand here, and the checks expect a single
; register-form vfmadd132ph with {%k1}.
define <32 x half> @fma_mask_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_mask_132_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd132ph %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1124
; Merge-masked 32 x half FMA computing %z * %x + %y; lanes whose %mask bit is
; clear keep %x. The checks expect a single register-form vfmadd213ph with
; {%k1}.
define <32 x half> @fma_mask_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_mask_312_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1136
; Zero-masked 32 x half FMA computing %x * %y + %z; lanes whose %mask bit is
; clear are zeroed by the select. The checks expect a single register-form
; vfmadd213ph carrying {%k1} {z}.
define <32 x half> @fma_maskz_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_maskz_123_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1148
; Zero-masked 32 x half FMA computing %y * %x + %z; lanes whose %mask bit is
; clear are zeroed by the select. The checks expect a single register-form
; vfmadd213ph carrying {%k1} {z}.
define <32 x half> @fma_maskz_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_maskz_213_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1160
; Zero-masked 32 x half FMA computing %y * %z + %x; lanes whose %mask bit is
; clear are zeroed by the select. The checks expect a single register-form
; vfmadd231ph carrying {%k1} {z}.
define <32 x half> @fma_maskz_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_maskz_231_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1172
; Zero-masked 32 x half FMA computing %z * %y + %x; lanes whose %mask bit is
; clear are zeroed by the select. The checks expect a single register-form
; vfmadd231ph carrying {%k1} {z}.
define <32 x half> @fma_maskz_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_maskz_321_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1184
; Zero-masked 32 x half FMA computing %x * %z + %y; lanes whose %mask bit is
; clear are zeroed by the select. The checks expect a single register-form
; vfmadd213ph carrying {%k1} {z}.
define <32 x half> @fma_maskz_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_maskz_132_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1196
; Zero-masked 32 x half FMA computing %z * %x + %y; lanes whose %mask bit is
; clear are zeroed by the select. The checks expect a single register-form
; vfmadd213ph carrying {%k1} {z}.
define <32 x half> @fma_maskz_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) {
; CHECK-LABEL: fma_maskz_312_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1208
; Merge-masked 32 x half FMA computing %x * %y + %z, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd213ph with {%k1}.
define <32 x half> @fma_mask_load_123_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_mask_load_123_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1221
; Merge-masked 32 x half FMA computing %y * %x + %z, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd213ph with {%k1}.
define <32 x half> @fma_mask_load_213_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_mask_load_213_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1234
; Merge-masked 32 x half FMA computing %y * %z + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd231ph with {%k1}.
define <32 x half> @fma_mask_load_231_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_mask_load_231_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1247
; Merge-masked 32 x half FMA computing %z * %y + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd231ph with {%k1}.
define <32 x half> @fma_mask_load_321_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_mask_load_321_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1260
; Merge-masked 32 x half FMA computing %x * %z + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd132ph with {%k1}.
define <32 x half> @fma_mask_load_132_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_mask_load_132_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1273
; Merge-masked 32 x half FMA computing %z * %x + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear keep %x. The checks expect the load to
; be folded into the (%rdi) operand of a single vfmadd132ph with {%k1}.
define <32 x half> @fma_mask_load_312_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_mask_load_312_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x
  ret <32 x half> %c
}
1286
; Zero-masked 32 x half FMA computing %x * %y + %z, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd213ph with {%k1} {z}.
define <32 x half> @fma_maskz_load_123_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_maskz_load_123_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1299
; Zero-masked 32 x half FMA computing %y * %x + %z, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd213ph with {%k1} {z}.
define <32 x half> @fma_maskz_load_213_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_maskz_load_213_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1312
; Zero-masked 32 x half FMA computing %y * %z + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd231ph with {%k1} {z}.
define <32 x half> @fma_maskz_load_231_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_maskz_load_231_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1325
; Zero-masked 32 x half FMA computing %z * %y + %x, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd231ph with {%k1} {z}.
define <32 x half> @fma_maskz_load_321_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_maskz_load_321_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1338
; Zero-masked 32 x half FMA computing %x * %z + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd132ph with {%k1} {z}.
define <32 x half> @fma_maskz_load_132_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_maskz_load_132_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1351
; Zero-masked 32 x half FMA computing %z * %x + %y, where %z is loaded from
; %zp; lanes whose %mask bit is clear are zeroed. The checks expect the load
; folded into the (%rdi) operand of a single vfmadd132ph with {%k1} {z}.
define <32 x half> @fma_maskz_load_312_v32f16(<32 x half> %x, <32 x half> %y, ptr %zp, i32 %mask) {
; CHECK-LABEL: fma_maskz_load_312_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %z = load <32 x half>, ptr %zp
  %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y)
  %b = bitcast i32 %mask to <32 x i1>
  %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer
  ret <32 x half> %c
}
1364