; xref: /llvm-project/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll (revision 79b69bf8c930036edc9ea09c0c334533ebbcda6f)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
6
; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

; fma(a0, a1, a2): the asm clobbers force a2 (zmm2) to spill; its reload must
; fold into the memory operand of vfmadd213ph.
define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  ret <32 x half> %2
}
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)

; fma(a1, a0, a2): same 213 folding as the 123 variant (commuted multiplicands).
define <32 x half> @stack_fold_fmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  ret <32 x half> %2
}

; fma(a1, a2, a0): spilled a2 folds as a multiplicand via the 231 form.
define <32 x half> @stack_fold_fmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  ret <32 x half> %2
}

; fma(a2, a1, a0): commuted form of 231; expects the same vfmadd231ph folding.
define <32 x half> @stack_fold_fmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  ret <32 x half> %2
}

; fma(a0, a2, a1): spilled a2 folds as a multiplicand via the 132 form.
define <32 x half> @stack_fold_fmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  ret <32 x half> %2
}

; fma(a2, a0, a1): commuted form of 132; expects the same vfmadd132ph folding.
define <32 x half> @stack_fold_fmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  ret <32 x half> %2
}

; Merge-masked fma(a0, a1, a2) with a0 loaded from %p as the passthru; the
; spilled a2 must still fold into the masked vfmadd213ph.
define <32 x half> @stack_fold_fmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a1, a0, a2), a0 loaded from %p; folds via masked vfmadd213ph.
define <32 x half> @stack_fold_fmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a1, a2, a0), a0 loaded from %p; folds via masked vfmadd231ph.
define <32 x half> @stack_fold_fmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a2, a1, a0), a0 loaded from %p; commuted 231 folding.
define <32 x half> @stack_fold_fmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a0, a2, a1), a0 loaded from %p; folds via masked vfmadd132ph.
define <32 x half> @stack_fold_fmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a2, a0, a1), a0 loaded from %p; commuted 132 folding.
define <32 x half> @stack_fold_fmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Zero-masked fma(a0, a1, a2) with the mask loaded from memory; spilled a2
; must fold into vfmadd213ph {%k1} {z}.
define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; Zero-masked fma(a1, a0, a2); same vfmadd213ph {z} folding as the 123 variant.
define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; Zero-masked fma(a1, a2, a0); folds via vfmadd231ph {%k1} {z}.
define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; Zero-masked fma(a2, a1, a0); commuted 231 folding with {%k1} {z}.
define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; Zero-masked fma(a0, a2, a1); folds via vfmadd132ph {%k1} {z}.
define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; Zero-masked fma(a2, a0, a1); commuted 132 folding with {%k1} {z}.
define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; fma(a0, a1, -a2): the fneg on the addend selects the vfmsub213ph form; the
; spilled a2 reload must fold into its memory operand.
define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

; fma(a1, a0, -a2): commuted multiplicands; same vfmsub213ph folding.
define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

; fma(a1, a2, -a0): negated addend is a0; folds via vfmsub231ph.
define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

; fma(a2, a1, -a0): commuted form of 231; same vfmsub231ph folding.
define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

; fma(a0, a2, -a1): negated addend is a1; folds via vfmsub132ph.
define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

; fma(a2, a0, -a1): commuted form of 132; same vfmsub132ph folding.
define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

; Merge-masked fma(a0, a1, -a2), a0 loaded from %p as passthru; masked
; vfmsub213ph must take the spilled operand from memory.
define <32 x half> @stack_fold_fmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a1, a0, -a2), a0 loaded from %p; same masked vfmsub213ph folding.
define <32 x half> @stack_fold_fmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a1, a2, -a0), a0 loaded from %p; folds via masked vfmsub231ph.
define <32 x half> @stack_fold_fmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a2, a1, -a0), a0 loaded from %p; commuted 231 folding.
define <32 x half> @stack_fold_fmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a0, a2, -a1), a0 loaded from %p; folds via masked vfmsub132ph.
define <32 x half> @stack_fold_fmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Merge-masked fma(a2, a0, -a1), a0 loaded from %p; commuted 132 folding.
define <32 x half> @stack_fold_fmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Zero-masked fmsub variants. Each permutation of fma(x, y, -z) must still
; fold the reload of the spilled %zmm2 into the appropriate (possibly
; commuted) vfmsub*ph memory form with {%k1} {z} zero-masking.

; 123 order: fma(a0, a1, -a2) -> vfmsub213ph, %a2 reload folded.
define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 213 order: fma(a1, a0, -a2) -> vfmsub213ph (multiplicands commute).
define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 231 order: fma(a1, a2, -a0) -> vfmsub231ph, %a2 reload folded.
define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 321 order: fma(a2, a1, -a0) -> vfmsub231ph (commuted).
define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 132 order: fma(a0, a2, -a1) -> vfmsub132ph, %a2 reload folded.
define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 312 order: fma(a2, a0, -a1) -> vfmsub132ph (commuted).
define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
654
; Unmasked fnmadd variants: fma with one negated multiplicand. The inline
; asm clobbers xmm3-xmm31, spilling %zmm2; each permutation must fold the
; 64-byte reload into the matching (possibly commuted) vfnmadd*ph form.

; 123 order: fma(-a0, a1, a2) -> vfnmadd213ph, %a2 reload folded.
define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a2)
  ret <32 x half> %3
}

; 213 order: fma(-a1, a0, a2) -> vfnmadd213ph (multiplicands commute).
define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a2)
  ret <32 x half> %3
}

; 231 order: fma(-a1, a2, a0) -> vfnmadd231ph, %a2 reload folded.
define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a0)
  ret <32 x half> %3
}

; 321 order: fma(-a2, a1, a0) -> vfnmadd231ph (commuted).
define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a0)
  ret <32 x half> %3
}

; 132 order: fma(-a0, a2, a1) -> vfnmadd132ph, %a2 reload folded.
define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a1)
  ret <32 x half> %3
}

; 312 order: fma(-a2, a0, a1) -> vfnmadd132ph (commuted).
define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a1)
  ret <32 x half> %3
}
744
; Merge-masked fnmadd variants: %a0 is loaded from %p and doubles as the
; masked passthru, so the fold must use a {%k1} merge form. The asm clobbers
; xmm2-xmm31, spilling %zmm1; the reload folds into vfnmadd*ph.

; 123 order: fma(-a0, a1, a2) -> vfnmadd213ph {%k1}.
define <32 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; 213 order: fma(-a1, a0, a2) -> vfnmadd213ph {%k1}.
define <32 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; 231 order: fma(-a1, a2, a0) -> vfnmadd231ph {%k1}.
define <32 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; 321 order: fma(-a2, a1, a0) -> vfnmadd231ph {%k1} (commuted).
define <32 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; 132 order: fma(-a0, a2, a1) -> vfnmadd132ph {%k1}.
define <32 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; 312 order: fma(-a2, a0, a1) -> vfnmadd132ph {%k1} (commuted).
define <32 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
870
; Zero-masked fnmadd variants: mask loaded from memory, reload of the
; spilled %zmm2 must fold into vfnmadd*ph with {%k1} {z}.

; 123 order: fma(-a0, a1, a2) -> vfnmadd213ph {%k1} {z}.
define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 213 order: fma(-a1, a0, a2) -> vfnmadd213ph {%k1} {z}.
define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 231 order: fma(-a1, a2, a0) -> vfnmadd231ph {%k1} {z}.
define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 321 order: fma(-a2, a1, a0) -> vfnmadd231ph {%k1} {z} (commuted).
define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 132 order: fma(-a0, a2, a1) -> vfnmadd132ph {%k1} {z}.
define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; 312 order: fma(-a2, a0, a1) -> vfnmadd132ph {%k1} {z} (commuted).
define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
984
; Unmasked fnmsub variants: both a multiplicand and the addend are negated,
; i.e. -(x*y) - z. The reload of the spilled %zmm2 must fold into the
; matching (possibly commuted) vfnmsub*ph form.

; 123 order: fma(-a0, a1, -a2) -> vfnmsub213ph, %a2 reload folded.
define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = fneg <32 x half> %a2
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
  ret <32 x half> %4
}

; 213 order: fma(-a1, a0, -a2) -> vfnmsub213ph (multiplicands commute).
define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = fneg <32 x half> %a2
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
  ret <32 x half> %4
}

; 231 order: fma(-a1, a2, -a0) -> vfnmsub231ph, %a2 reload folded.
define <32 x half> @stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = fneg <32 x half> %a0
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
  ret <32 x half> %4
}

; 321 order: fma(-a2, a1, -a0) -> vfnmsub231ph (commuted).
define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = fneg <32 x half> %a0
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
  ret <32 x half> %4
}

; 132 order: fma(-a0, a2, -a1) -> vfnmsub132ph, %a2 reload folded.
define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = fneg <32 x half> %a1
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
  ret <32 x half> %4
}

; 312 order: fma(-a2, a0, -a1) -> vfnmsub132ph (commuted).
define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = fneg <32 x half> %a1
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
  ret <32 x half> %4
}
1080
; Merge-masked -(a0 * a1) - a2 with %a0 (loaded from %p) as the passthrough;
; checks the spilled %a2 reload folds into vfnmsub213ph {%k1}.
define <32 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
1102
; Merge-masked -(a1 * a0) - a2 with %a0 (loaded from %p) as the passthrough;
; checks the spilled %a2 reload folds into vfnmsub213ph {%k1}.
define <32 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
1124
; Merge-masked -(a1 * a2) - a0 with %a0 (loaded from %p) as the passthrough;
; checks the spilled %a2 reload folds into vfnmsub231ph {%k1}.
define <32 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
1146
; Merge-masked -(a2 * a1) - a0 (321 order, commuted to 231) with %a0 (loaded
; from %p) as passthrough; the spilled %a2 reload folds into vfnmsub231ph {%k1}.
define <32 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
1168
; Merge-masked -(a0 * a2) - a1 with %a0 (loaded from %p) as the passthrough;
; checks the spilled %a2 reload folds into vfnmsub132ph {%k1}.
define <32 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
1190
; Merge-masked -(a2 * a0) - a1 (312 order, commuted to 132) with %a0 (loaded
; from %p) as passthrough; the spilled %a2 reload folds into vfnmsub132ph {%k1}.
define <32 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
1212
; Zero-masked -(a0 * a1) - a2 (mask loaded from memory); checks the spilled
; %a2 reload folds into vfnmsub213ph {%k1} {z}.
define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
1232
; Zero-masked -(a1 * a0) - a2 (mask loaded from memory); checks the spilled
; %a2 reload folds into vfnmsub213ph {%k1} {z}.
define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
1252
; Zero-masked -(a1 * a2) - a0 (mask loaded from memory); checks the spilled
; %a2 reload folds into vfnmsub231ph {%k1} {z}.
define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
1272
; Zero-masked -(a2 * a1) - a0 (321 order, commuted to 231); checks the spilled
; %a2 reload folds into vfnmsub231ph {%k1} {z}.
define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
1292
; Zero-masked -(a0 * a2) - a1 (mask loaded from memory); checks the spilled
; %a2 reload folds into vfnmsub132ph {%k1} {z}.
define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
1312
; Zero-masked -(a2 * a0) - a1 (312 order, commuted to 132); checks the spilled
; %a2 reload folds into vfnmsub132ph {%k1} {z}.
define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
1332
; Scalar fp16 a0 * a1 + a2; checks the spilled %a2 (4-byte vmovsh slot) is
; reloaded by folding it into vfmadd213sh's memory operand.
define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  ret half %2
}
1346declare half @llvm.fma.f16(half, half, half)
1347
; Scalar fp16 a1 * a0 + a2 (213 order); checks the spilled %a2 reload folds
; into vfmadd213sh's memory operand.
define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  ret half %2
}
1361
; Scalar fp16 a1 * a2 + a0 (231 order); checks the spilled %a2 reload folds
; into vfmadd231sh's memory operand.
define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  ret half %2
}
1375
; Scalar fp16 a2 * a1 + a0 (321 order, commuted to 231); checks the spilled
; %a2 reload folds into vfmadd231sh's memory operand.
define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  ret half %2
}
1389
; Scalar fp16 a0 * a2 + a1 (132 order); checks the spilled %a2 reload folds
; into vfmadd132sh's memory operand.
define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  ret half %2
}
1403
; Scalar fp16 a2 * a0 + a1 (312 order, commuted to 132); checks the spilled
; %a2 reload folds into vfmadd132sh's memory operand.
define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  ret half %2
}
1417
; Scalar fp16 a0 * a1 - a2 (fneg on the addend selects fmsub); checks the
; spilled %a2 reload folds into vfmsub213sh's memory operand.
define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2)
  ret half %3
}
1432
; Scalar fp16 a1 * a0 - a2 (213 order); checks the spilled %a2 reload folds
; into vfmsub213sh's memory operand.
define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
  ret half %3
}
1447
; Scalar fp16 a1 * a2 - a0 (231 order); checks the spilled %a2 reload folds
; into vfmsub231sh's memory operand.
define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
  ret half %3
}
1462
; Scalar fp16 a2 * a1 - a0 (321 order, commuted to 231); checks the spilled
; %a2 reload folds into vfmsub231sh's memory operand.
define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
  ret half %3
}
1477
; Scalar fp16 a0 * a2 - a1 (132 order); checks the spilled %a2 reload folds
; into vfmsub132sh's memory operand.
define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
  ret half %3
}
1492
; Scalar fp16 a2 * a0 - a1 (312 order, commuted to 132); checks the spilled
; %a2 reload folds into vfmsub132sh's memory operand.
define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
  ret half %3
}
1507
; Scalar fp16 -(a0 * a1) + a2 (fneg on a multiplicand selects fnmadd); checks
; the spilled %a2 reload folds into vfnmadd213sh's memory operand.
define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
  ret half %3
}
1522
; Scalar fp16 -(a1 * a0) + a2 (213 order); checks the spilled %a2 reload
; folds into vfnmadd213sh's memory operand.
define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
  ret half %3
}
1537
; Scalar fp16 -(a1 * a2) + a0 (231 order); checks the spilled %a2 reload
; folds into vfnmadd231sh's memory operand.
define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
  ret half %3
}
1552
; Scalar fp16 -(a2 * a1) + a0 (321 order, commuted to 231); checks the
; spilled %a2 reload folds into vfnmadd231sh's memory operand.
define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
  ret half %3
}
1567
; Scalar fp16 -(a0 * a2) + a1 (132 order); checks the spilled %a2 reload
; folds into vfnmadd132sh's memory operand.
define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
  ret half %3
}
1582
; Scalar fp16 -(a2 * a0) + a1 (312 order, commuted to 132); checks the
; spilled %a2 reload folds into vfnmadd132sh's memory operand.
define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
  ret half %3
}
1597
; Scalar fp16 -(a0 * a1) - a2 (both multiplicand and addend negated selects
; fnmsub); checks the spilled %a2 reload folds into vfnmsub213sh.
define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = fneg half %a2
  %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
  ret half %4
}
1613
; Scalar fp16 -(a1 * a0) - a2 (213 order); checks the spilled %a2 reload
; folds into vfnmsub213sh's memory operand.
define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = fneg half %a2
  %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
  ret half %4
}
1629
; fnmsub, 231 order: -(a1*a2) - a0. Here the spilled %a2 is a multiplicand,
; so the fold target becomes vfnmsub231sh rather than the 213 form.
define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = fneg half %a0
  %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
  ret half %4
}
1645
; fnmsub, 321 order: -(a2*a1) - a0. Commuted twin of the 231 case; expects
; the same vfnmsub231sh folded reload.
define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = fneg half %a0
  %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
  ret half %4
}
1661
; fnmsub, 132 order: -(a0*a2) - a1. Spilled %a2 multiplies the destination
; operand, so the fold target is vfnmsub132sh.
define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = fneg half %a1
  %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
  ret half %4
}
1677
; fnmsub, 312 order: -(a2*a0) - a1. Commuted twin of the 132 case; expects
; the same vfnmsub132sh folded reload.
define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = fneg half %a1
  %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
  ret half %4
}
1693
; Intrinsic-style scalar fmadd, 123 order: the FMA is computed on element 0
; of the <8 x half> arguments and reinserted into %a0v, mirroring the
; _mm_fmadd_sh pattern. The full 16-byte xmm2 spill must fold into
; vfmadd213sh.
define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1711
; Intrinsic-style scalar fmadd, 213 order: a1*a0 + a2 on element 0. Same
; multiplicand pair as 123, so the same vfmadd213sh folded reload is expected.
define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1729
; Intrinsic-style scalar fmadd, 231 order: a1*a2 + a0 on element 0. The
; spilled %a2v is a multiplicand, so vfmadd231sh is the expected fold target.
define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1747
; Intrinsic-style scalar fmadd, 321 order: a2*a1 + a0 on element 0. Commuted
; twin of the 231 case; expects the same vfmadd231sh folded reload.
define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1765
; Intrinsic-style scalar fmadd, 132 order: a0*a2 + a1 on element 0. Spilled
; %a2v multiplies the destination operand, so vfmadd132sh is expected.
define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1783
; Intrinsic-style scalar fmadd, 312 order: a2*a0 + a1 on element 0. Commuted
; twin of the 132 case; expects the same vfmadd132sh folded reload.
define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1801
; Intrinsic-style scalar fmsub, 123 order: a0*a1 - a2 on element 0 (the
; addend is negated). The spilled addend folds into vfmsub213sh.
define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1820
; Intrinsic-style scalar fmsub, 213 order: a1*a0 - a2 on element 0. Same
; multiplicand pair as 123, so the same vfmsub213sh folded reload is expected.
define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1839
; Intrinsic-style scalar fmsub, 231 order: a1*a2 - a0 on element 0. Spilled
; %a2v is a multiplicand, so vfmsub231sh is the expected fold target.
define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1858
; Intrinsic-style scalar fmsub, 321 order: a2*a1 - a0 on element 0. Commuted
; twin of the 231 case; expects the same vfmsub231sh folded reload.
define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1877
; Intrinsic-style scalar fmsub, 132 order: a0*a2 - a1 on element 0. Spilled
; %a2v multiplies the destination operand, so vfmsub132sh is expected.
define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1896
; Intrinsic-style scalar fmsub, 312 order: a2*a0 - a1 on element 0. Commuted
; twin of the 132 case; expects the same vfmsub132sh folded reload.
define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1915
; Intrinsic-style scalar fnmadd, 123 order: -(a0*a1) + a2 on element 0 (one
; multiplicand negated). The spilled addend folds into vfnmadd213sh.
define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1934
; Intrinsic-style scalar fnmadd, 213 order: -(a1*a0) + a2 on element 0. Same
; multiplicand pair as 123; expects the same vfnmadd213sh folded reload.
define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1953
; Intrinsic-style scalar fnmadd, 231 order: -(a1*a2) + a0 on element 0.
; Spilled %a2v is a multiplicand, so vfnmadd231sh is the expected fold target.
define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1972
; Intrinsic-style scalar fnmadd, 321 order: -(a2*a1) + a0 on element 0.
; Commuted twin of the 231 case; expects the same vfnmadd231sh folded reload.
define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
1991
; Intrinsic-style scalar fnmadd, 132 order: -(a0*a2) + a1 on element 0.
; Spilled %a2v multiplies the destination operand, so vfnmadd132sh is expected.
define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2010
; Intrinsic-style scalar fnmadd, 312 order: -(a2*a0) + a1 on element 0.
; Commuted twin of the 132 case; expects the same vfnmadd132sh folded reload.
define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2029
; Intrinsic-style scalar fnmsub, 123 order: -(a0*a1) - a2 on element 0 (both
; a multiplicand and the addend negated). Folds into vfnmsub213sh.
define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2049
; Intrinsic-style scalar fnmsub, 213 order: -(a1*a0) - a2 on element 0. Same
; multiplicand pair as 123; expects the same vfnmsub213sh folded reload.
define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2069
; Intrinsic-style scalar fnmsub, 231 order: -(a1*a2) - a0 on element 0.
; Spilled %a2v is a multiplicand, so vfnmsub231sh is the expected fold target.
define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2089
; Intrinsic-style scalar fnmsub, 321 order: -(a2*a1) - a0 on element 0.
; Commuted twin of the 231 case; expects the same vfnmsub231sh folded reload.
define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2109
; Intrinsic-style scalar fnmsub, 132 order: -(a0*a2) - a1 on element 0.
; Spilled %a2v multiplies the destination operand, so vfnmsub132sh is expected.
define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2129
; Intrinsic-style scalar fnmsub, 312 order: -(a2*a0) - a1 on element 0.
; Commuted twin of the 132 case; expects the same vfnmsub132sh folded reload.
define <8 x half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}
2149
; Masked intrinsic-style scalar fmadd, 123 order. Bit 0 of the i8 loaded from
; %mask selects between the FMA result and the passthrough element a0; the
; backend must fold the reload into a {%k1}-masked vfmadd213sh.
define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2172
define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a1, a0, a2) (213 operand order): the spilled
; %a2v reload must fold into vfmadd213sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2195
define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a1, a2, a0) (231 operand order): the spilled
; %a2v reload must fold into vfmadd231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2218
define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a2, a1, a0) (321 order): multiplication is
; commutative, so this should still fold the %a2v reload into vfmadd231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2241
define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a0, a2, a1) (132 operand order): the spilled
; %a2v reload must fold into vfmadd132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2264
define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a2, a0, a1) (312 order): commutation of the
; multiply should still fold the %a2v reload into vfmadd132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2287
define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a0, a1, -a2): the negated addend selects the
; fmsub form; the spilled %a2v reload must fold into vfmsub213sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2311
define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a1, a0, -a2) (213 order, negated addend): the
; spilled %a2v reload must fold into vfmsub213sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2335
define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a1, a2, -a0) (231 order, negated addend): the
; spilled %a2v reload must fold into vfmsub231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2359
define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a2, a1, -a0) (321 order): commutation of the
; multiply should still fold the %a2v reload into vfmsub231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2383
define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a0, a2, -a1) (132 order, negated addend): the
; spilled %a2v reload must fold into vfmsub132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2407
define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(a2, a0, -a1) (312 order): commutation of the
; multiply should still fold the %a2v reload into vfmsub132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2431
define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a0, a1, a2): the negated multiplicand selects
; the fnmadd form; the spilled %a2v reload must fold into vfnmadd213sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2455
define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a1, a0, a2) (213 order, negated multiplicand):
; the spilled %a2v reload must fold into vfnmadd213sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2479
define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a1, a2, a0) (231 order, negated multiplicand):
; the spilled %a2v reload must fold into vfnmadd231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2503
define <8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a2, a1, a0) (321 order): commutation of the
; multiply should still fold the %a2v reload into vfnmadd231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2527
define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a0, a2, a1) (132 order, negated multiplicand):
; the spilled %a2v reload must fold into vfnmadd132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2551
define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a2, a0, a1) (312 order): commutation of the
; multiply should still fold the %a2v reload into vfnmadd132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2575
define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a0, a1, -a2): both multiplicand and addend
; negated selects fnmsub; the spilled %a2v reload must fold into
; vfnmsub213sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2600
define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a1, a0, -a2) (213 order, both negated): the
; spilled %a2v reload must fold into vfnmsub213sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2625
define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a1, a2, -a0) (231 order, both negated): the
; spilled %a2v reload must fold into vfnmsub231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2650
define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a2, a1, -a0) (321 order): commutation of the
; multiply should still fold the %a2v reload into vfnmsub231sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2675
define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a0, a2, -a1) (132 order, both negated): the
; spilled %a2v reload must fold into vfnmsub132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2700
define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Merge-masked scalar fp16 fma(-a2, a0, -a1) (312 order): commutation of the
; multiply should still fold the %a2v reload into vfnmsub132sh {%k1}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 passes through %a0.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2725
define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
; Zero-masked scalar fp16 fma(a0, a1, a2): the select falls back to 0.0, so
; the fold must use the {z} form; the spilled %a2v reload folds into
; vfmadd213sh {%k1} {z}.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  ; Mask bit 0 selects the FMA result; otherwise lane 0 is zeroed.
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2748
; Zero-masked scalar FMA, 213 operand order: fma(a1, a0, a2) with zero-masking.
; Commuting the multiplicands still selects the 213 instruction form; the
; spilled %a2v reload must fold into vfmadd213sh (see CHECK lines).
define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; fma(a1, a0, a2) -- the "213" source order
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2771
; Zero-masked scalar FMA, 231 operand order: fma(a1, a2, a0). With the stack
; operand as a multiplicand, the 231 instruction form is expected; the reload
; must fold into vfmadd231sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; fma(a1, a2, a0) -- the "231" source order
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2794
; Zero-masked scalar FMA, 321 operand order: fma(a2, a1, a0). Commutation of
; the multiplicands maps this onto the same 231 instruction form; the reload
; must fold into vfmadd231sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; fma(a2, a1, a0) -- the "321" source order
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2817
; Zero-masked scalar FMA, 132 operand order: fma(a0, a2, a1). The stack operand
; is the second multiplicand, so the 132 instruction form is expected; the
; reload must fold into vfmadd132sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; fma(a0, a2, a1) -- the "132" source order
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2840
; Zero-masked scalar FMA, 312 operand order: fma(a2, a0, a1). Commuting the
; multiplicands maps this onto the 132 instruction form; the reload must fold
; into vfmadd132sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; fma(a2, a0, a1) -- the "312" source order
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2863
; Zero-masked scalar FMSUB, 123 order: fma(a0, a1, -a2) = a0*a1 - a2. The
; negated addend selects the fmsub family; the spilled %a2v reload must fold
; into vfmsub213sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend: fmsub = fma(a, b, -c)
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2887
; Zero-masked scalar FMSUB, 213 order: fma(a1, a0, -a2) = a1*a0 - a2; the
; spilled %a2v reload must fold into vfmsub213sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend: fmsub = fma(a, b, -c)
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2911
; Zero-masked scalar FMSUB, 231 order: fma(a1, a2, -a0) = a1*a2 - a0; the
; spilled %a2v reload must fold into vfmsub231sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; here a0 is the subtracted addend
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2935
; Zero-masked scalar FMSUB, 321 order: fma(a2, a1, -a0) = a2*a1 - a0. Commuted
; multiplicands map onto the 231 form; the reload must fold into vfmsub231sh.
define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; here a0 is the subtracted addend
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2959
; Zero-masked scalar FMSUB, 132 order: fma(a0, a2, -a1) = a0*a2 - a1; the
; spilled %a2v reload must fold into vfmsub132sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; here a1 is the subtracted addend
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
2983
; Zero-masked scalar FMSUB, 312 order: fma(a2, a0, -a1) = a2*a0 - a1. Commuted
; multiplicands map onto the 132 form; the reload must fold into vfmsub132sh.
define <8 x half> @stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; here a1 is the subtracted addend
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3007
; Zero-masked scalar FNMADD, 123 order: fma(-a0, a1, a2) = -(a0*a1) + a2. The
; negated multiplicand selects the fnmadd family; the spilled %a2v reload must
; fold into vfnmadd213sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate a multiplicand: fnmadd = fma(-a, b, c)
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3031
; Zero-masked scalar FNMADD, 213 order: fma(-a1, a0, a2) = -(a1*a0) + a2; the
; spilled %a2v reload must fold into vfnmadd213sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate a multiplicand: fnmadd = fma(-a, b, c)
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3055
; Zero-masked scalar FNMADD, 231 order: fma(-a1, a2, a0) = -(a1*a2) + a0; the
; spilled %a2v reload must fold into vfnmadd231sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate a multiplicand: fnmadd = fma(-a, b, c)
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3079
; Zero-masked scalar FNMADD, 321 order: fma(-a2, a1, a0) = -(a2*a1) + a0.
; Commuted multiplicands map onto the 231 form; reload folds into vfnmadd231sh.
define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate a multiplicand: fnmadd = fma(-a, b, c)
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3103
; Zero-masked scalar FNMADD, 132 order: fma(-a0, a2, a1) = -(a0*a2) + a1; the
; spilled %a2v reload must fold into vfnmadd132sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate a multiplicand: fnmadd = fma(-a, b, c)
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3127
; Zero-masked scalar FNMADD, 312 order: fma(-a2, a0, a1) = -(a2*a0) + a1.
; Commuted multiplicands map onto the 132 form; reload folds into vfnmadd132sh.
define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate a multiplicand: fnmadd = fma(-a, b, c)
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3151
; Zero-masked scalar FNMSUB, 123 order: fma(-a0, a1, -a2) = -(a0*a1) - a2.
; Both a multiplicand and the addend are negated, selecting the fnmsub family;
; the spilled %a2v reload must fold into vfnmsub213sh {%k1} {z} (see CHECKs).
define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend and one multiplicand: fnmsub = fma(-a, b, -c)
  %neg = fneg half %a2
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3176
; Zero-masked scalar FNMSUB, 213 order: fma(-a1, a0, -a2) = -(a1*a0) - a2; the
; spilled %a2v reload must fold into vfnmsub213sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend and one multiplicand: fnmsub = fma(-a, b, -c)
  %neg = fneg half %a2
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3201
; Zero-masked scalar FNMSUB, 231 order: fma(-a1, a2, -a0) = -(a1*a2) - a0; the
; spilled %a2v reload must fold into vfnmsub231sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend (a0 here) and one multiplicand
  %neg = fneg half %a0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3226
; Zero-masked scalar FNMSUB, 321 order: fma(-a2, a1, -a0) = -(a2*a1) - a0.
; Commuted multiplicands map onto the 231 form; reload folds into vfnmsub231sh.
define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend (a0 here) and one multiplicand
  %neg = fneg half %a0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3251
; Zero-masked scalar FNMSUB, 132 order: fma(-a0, a2, -a1) = -(a0*a2) - a1; the
; spilled %a2v reload must fold into vfnmsub132sh {%k1} {z} (see CHECK lines).
define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend (a1 here) and one multiplicand
  %neg = fneg half %a1
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3276
; Zero-masked scalar FNMSUB, 312 order: fma(-a2, a0, -a1) = -(a2*a0) - a1.
; Commuted multiplicands map onto the 132 form; reload folds into vfnmsub132sh.
define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  ; negate the addend (a1 here) and one multiplicand
  %neg = fneg half %a1
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}
3301
; Packed 512-bit FMADDSUB (alternating add/sub per lane), 123 operand order via
; the target intrinsic with round-to-nearest (i32 4 = current rounding). The
; asm nop forces %a2 to spill; the reload must fold into vfmaddsub213ph.
define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  ret <32 x half> %2
}
declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)
3316
; Operand order (%a1, %a0, %a2) still selects vfmaddsub213ph; the reload of
; the spilled %a2 must fold into its memory operand.
define <32 x half> @stack_fold_fmaddsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  ret <32 x half> %2
}
3330
; Operand order (%a1, %a2, %a0) selects vfmaddsub231ph; the reload of the
; spilled %a2 must fold into its memory operand.
define <32 x half> @stack_fold_fmaddsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  ret <32 x half> %2
}
3344
; Operand order (%a2, %a1, %a0) commutes to the vfmaddsub231ph form; the
; reload of the spilled %a2 must fold into its memory operand.
define <32 x half> @stack_fold_fmaddsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  ret <32 x half> %2
}
3358
; Operand order (%a0, %a2, %a1) selects vfmaddsub132ph; the reload of the
; spilled %a2 must fold into its memory operand.
define <32 x half> @stack_fold_fmaddsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  ret <32 x half> %2
}
3372
; Operand order (%a2, %a0, %a1) commutes to the vfmaddsub132ph form; the
; reload of the spilled %a2 must fold into its memory operand.
define <32 x half> @stack_fold_fmaddsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  ret <32 x half> %2
}
3386
; %a2 (zmm1) is spilled around the asm; the folded vfmaddsub213ph reload is
; merge-masked by %k1 into %a0, which is loaded from %p.
define <32 x half> @stack_fold_fmaddsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3406
; 213 operand order with merge masking: the folded vfmaddsub213ph reload is
; masked by %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmaddsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3426
; 231 operand order with merge masking: the folded vfmaddsub231ph reload is
; masked by %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmaddsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3446
; 321 operand order commutes to vfmaddsub231ph; merge-masked by %k1 into %a0
; loaded from %p, with the spilled %a2 folded from the stack.
define <32 x half> @stack_fold_fmaddsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3466
; 132 operand order with merge masking: the folded vfmaddsub132ph reload is
; masked by %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmaddsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3486
; 312 operand order commutes to vfmaddsub132ph; merge-masked by %k1 into %a0
; loaded from %p, with the spilled %a2 folded from the stack.
define <32 x half> @stack_fold_fmaddsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3506
; %a2 is spilled around the asm; the folded vfmaddsub213ph reload uses
; zero-masking ({z}) with the i32 mask loaded through %rdi.
define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3524
; 213 operand order with zero-masking: the folded vfmaddsub213ph reload uses
; {z} with the i32 mask loaded through %rdi.
define <32 x half> @stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3542
; 231 operand order with zero-masking: the folded vfmaddsub231ph reload uses
; {z} with the i32 mask loaded through %rdi.
define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3560
; 321 operand order commutes to vfmaddsub231ph; zero-masked ({z}) with the
; i32 mask loaded through %rdi.
define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3578
; 132 operand order with zero-masking: the folded vfmaddsub132ph reload uses
; {z} with the i32 mask loaded through %rdi.
define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3596
; 312 operand order commutes to vfmaddsub132ph; zero-masked ({z}) with the
; i32 mask loaded through %rdi.
define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3614
; The fneg of the addend %a2 selects the vfmsubadd form; the reload of the
; spilled %a2 must fold into vfmsubadd213ph's memory operand.
define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}
3629
; fneg of %a2 with 213 operand order selects vfmsubadd213ph; the reload of
; the spilled %a2 must fold into its memory operand.
define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}
3644
; fneg of the addend %a0 with (%a1, %a2, -%a0) operands selects
; vfmsubadd231ph; the spilled %a2 is folded from the stack.
define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}
3659
; fneg of %a0 with (%a2, %a1, -%a0) operands commutes to vfmsubadd231ph; the
; spilled %a2 is folded from the stack.
define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}
3674
; fneg of %a1 with (%a0, %a2, -%a1) operands selects vfmsubadd132ph; the
; spilled %a2 is folded from the stack.
define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}
3689
; fneg of %a1 with (%a2, %a0, -%a1) operands commutes to vfmsubadd132ph; the
; spilled %a2 is folded from the stack.
define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}
3704
; fneg of %a2 selects vfmsubadd213ph; merge-masked by %k1 into %a0 loaded
; from %p, with the spilled %a2 folded from the stack.
define <32 x half> @stack_fold_fmsubadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3725
; fneg of %a2 with 213 operand order selects vfmsubadd213ph; merge-masked by
; %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmsubadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3746
; fneg of the loaded %a0 with 231 operand order selects vfmsubadd231ph;
; merge-masked by %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmsubadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3767
; fneg of the loaded %a0 with 321 operand order commutes to vfmsubadd231ph;
; merge-masked by %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmsubadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3788
; fneg of %a1 with 132 operand order selects vfmsubadd132ph; merge-masked by
; %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmsubadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3809
; fneg of %a1 with 312 operand order commutes to vfmsubadd132ph; merge-masked
; by %k1 into %a0 loaded from %p.
define <32 x half> @stack_fold_fmsubadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
3830
; Zero-masked fold test (123 operand order): the inline asm clobbers
; xmm3-xmm31, forcing %a2 (zmm2) to be spilled; the 64-byte reload must
; fold into vfmsubadd213ph {%k1} {z}. The mask is loaded from memory and
; the select zeroes unselected lanes (zeroinitializer passthrough).
define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3849
; Zero-masked fold test (213 operand order): identical to the 123 variant
; except the two multiplicand operands of the intrinsic are swapped
; (%a1, %a0); the CHECK lines expect the same vfmsubadd213ph {%k1} {z}
; with the spilled %a2 folded from the stack.
define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3868
; Zero-masked fold test (231 operand order): here %a0 is the (negated)
; addend and %a1/%a2 are the multiplicands; %a2 (zmm2) is spilled across
; the asm and its reload must fold into vfmsubadd231ph {%k1} {z}.
define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3887
; Zero-masked fold test (321 operand order): multiplicands swapped relative
; to the 231 variant (%a2, %a1), addend still fneg %a0; the CHECK lines
; expect the same commuted vfmsubadd231ph {%k1} {z} with the spilled %a2
; folded from the stack.
define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3906
; Zero-masked fold test (132 operand order): %a1 is the (negated) addend,
; %a0/%a2 are the multiplicands; the spilled %a2 (zmm2) must fold into
; vfmsubadd132ph {%k1} {z} as a stack operand.
define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3925
; Zero-masked fold test (312 operand order): multiplicands swapped relative
; to the 132 variant (%a2, %a0), addend still fneg %a1; the CHECK lines
; expect the same commuted vfmsubadd132ph {%k1} {z} with the spilled %a2
; folded from the stack.
define <32 x half> @stack_fold_fmsubadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
3944