; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects, we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
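;
; A note on naming: the digits in each test name give the order in which %a0,
; %a1 and %a2 are passed to llvm.fma (e.g. "123" is fma(%a0, %a1, %a2) and
; "231" is fma(%a1, %a2, %a0)), and the CHECK lines verify which of the
; 132/213/231 instruction forms is selected once the reload is folded in as
; the memory operand.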

define <8 x half> @stack_fold_fmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
  ret <8 x half> %2
}
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)

define <8 x half> @stack_fold_fmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
  ret <8 x half> %2
}

define <8 x half> @stack_fold_fmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
  ret <8 x half> %2
}

define <8 x half> @stack_fold_fmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
  ret <8 x half> %2
}

define <8 x half> @stack_fold_fmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
  ret <8 x half> %2
}

define <8 x half> @stack_fold_fmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}

define <8 x half> @stack_fold_fmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a2
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a2
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a0
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a0
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a1
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a1
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a0
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fnmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a1
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a2)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fnmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a1
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a0)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fnmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a2
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a0)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fnmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a0
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a1)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fnmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a2
  %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
; CHECK:       # %bb.0:
874; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
875; CHECK-NEXT:    #APP
876; CHECK-NEXT:    nop
877; CHECK-NEXT:    #NO_APP
878; CHECK-NEXT:    kmovb (%rdi), %k1
879; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
880; CHECK-NEXT:    retq
881  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
882  %neg = fneg <8 x half> %a0
883  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2)
884  %3 = load i8, ptr %mask
885  %4 = bitcast i8 %3 to <8 x i1>
886  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
887  ret <8 x half> %5
888}
889
890define <8 x half> @stack_fold_fnmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
891; CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
892; CHECK:       # %bb.0:
893; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
894; CHECK-NEXT:    #APP
895; CHECK-NEXT:    nop
896; CHECK-NEXT:    #NO_APP
897; CHECK-NEXT:    kmovb (%rdi), %k1
898; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
899; CHECK-NEXT:    retq
900  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
901  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

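; FNMSUB tests: each variant negates one multiplicand and the addend, so the
; fma computes -(a*b) - c and should fold to a vfnmsub instruction.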
define <8 x half> @stack_fold_fnmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a0
  %3 = fneg <8 x half> %a2
  %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3)
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a1
  %3 = fneg <8 x half> %a2
  %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3)
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a1
  %3 = fneg <8 x half> %a0
  %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3)
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a2
  %3 = fneg <8 x half> %a0
  %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3)
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a0
  %3 = fneg <8 x half> %a1
  %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3)
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <8 x half> %a2
  %3 = fneg <8 x half> %a1
  %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3)
  ret <8 x half> %4
}

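; Merge-masked FNMSUB: %mask selects between the result and the passthru
; value %a0 loaded from %p, so the fold must use the {%k1} merging form.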
define <8 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a2
  %neg1 = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a2
  %neg1 = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a0
  %neg1 = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a0
  %neg1 = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a1
  %neg1 = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

define <8 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x half>, ptr %p
  %neg = fneg <8 x half> %a1
  %neg1 = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0
  ret <8 x half> %4
}

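; Zero-masked FNMSUB: the mask is loaded from memory and inactive lanes are
; zeroed, so the fold should use the {%k1} {z} form.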
define <8 x half> @stack_fold_fnmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a2
  %neg1 = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a2
  %neg1 = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a0
  %neg1 = fneg <8 x half> %a1
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a0
  %neg1 = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a1
  %neg1 = fneg <8 x half> %a0
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <8 x half> @stack_fold_fnmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <8 x half> %a1
  %neg1 = fneg <8 x half> %a2
  %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer
  ret <8 x half> %5
}

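; 256-bit (ymm) versions of the same tests, using <16 x half> vectors and an
; i16 mask.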
define <16 x half> @stack_fold_fmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd123ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
  ret <16 x half> %2
}
declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)

define <16 x half> @stack_fold_fmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd213ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
  ret <16 x half> %2
}

define <16 x half> @stack_fold_fmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd231ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
  ret <16 x half> %2
}

define <16 x half> @stack_fold_fmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd321ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
  ret <16 x half> %2
}

define <16 x half> @stack_fold_fmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd132ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
  ret <16 x half> %2
}

define <16 x half> @stack_fold_fmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd312ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}

define <16 x half> @stack_fold_fmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

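; FMSUB tests (ymm): a single fneg on the addend gives a*b - c.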
define <16 x half> @stack_fold_fmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub123ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a2
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub213ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a2
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub231ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a0
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub321ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a0
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub132ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a1
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub312ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a1
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1968  %neg = fneg <16 x half> %a1
1969  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg)
1970  %3 = load i16, ptr %mask
1971  %4 = bitcast i16 %3 to <16 x i1>
1972  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
1973  ret <16 x half> %5
1974}
1975
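; FNMADD tests: fneg on one multiplicand of llvm.fma should select the
; vfnmadd{132,213,231}ph memory forms; the 321/312 operand orders commute
; onto the same 231/132 encodings.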
define <16 x half> @stack_fold_fnmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd123ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a0
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fnmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd213ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a1
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a2)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fnmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd231ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a1
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a0)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fnmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd321ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a2
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a0)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fnmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd132ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a0
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a1)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_fnmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd312ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a2
  %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %3
}

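; Merge-masked FNMADD: %a0 is reloaded from %p and doubles as the select
; passthrough, so the mask should lower to {%k1} merge-masking on the
; folded vfnmadd instruction.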
define <16 x half> @stack_fold_fnmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

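; Zero-masked FNMADD: the mask word is loaded via kmovw and the select
; against zeroinitializer should give the {%k1} {z} forms.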
define <16 x half> @stack_fold_fnmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

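; FNMSUB tests: both a multiplicand and the addend are negated, i.e.
; -(a*b) - c == fma(-a, b, -c), so the vfnmsub{132,213,231}ph memory
; forms are expected.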
define <16 x half> @stack_fold_fnmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub123ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a0
  %3 = fneg <16 x half> %a2
  %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3)
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub213ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a1
  %3 = fneg <16 x half> %a2
  %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3)
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub231ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a1
  %3 = fneg <16 x half> %a0
  %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3)
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub321ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a2
  %3 = fneg <16 x half> %a0
  %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3)
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub132ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a0
  %3 = fneg <16 x half> %a1
  %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3)
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub312ph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <16 x half> %a2
  %3 = fneg <16 x half> %a1
  %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3)
  ret <16 x half> %4
}

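; Merge-masked FNMSUB: same structure as the merge-masked FNMADD tests
; above, with the addend negation folded into the vfnmsub selection.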
define <16 x half> @stack_fold_fnmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a2
  %neg1 = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a2
  %neg1 = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a0
  %neg1 = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a0
  %neg1 = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a1
  %neg1 = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

define <16 x half> @stack_fold_fnmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <16 x half>, ptr %p
  %neg = fneg <16 x half> %a1
  %neg1 = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0
  ret <16 x half> %4
}

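; Zero-masked FNMSUB: expects the {%k1} {z} forms via a kmovw-loaded mask.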
define <16 x half> @stack_fold_fnmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a2
  %neg1 = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a2
  %neg1 = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a0
  %neg1 = fneg <16 x half> %a1
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a0
  %neg1 = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a1
  %neg1 = fneg <16 x half> %a0
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <16 x half> @stack_fold_fnmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <16 x half> %a1
  %neg1 = fneg <16 x half> %a2
  %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg)
  %3 = load i16, ptr %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer
  ret <16 x half> %5
}
