xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fneg-combines.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; RUN: llc -mtriple=amdgcn -mcpu=hawaii -start-before=amdgpu-unify-divergent-exit-nodes -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
2; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s
3
4; RUN: llc -mtriple=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
5; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s
6
7; --------------------------------------------------------------------------------
8; fadd tests
9; --------------------------------------------------------------------------------
10
11; GCN-LABEL: {{^}}v_fneg_add_f32:
12; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
13; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
14
15; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
16; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
17
18; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
19; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Per-lane -(a + b): single-use fadd, so the fneg should be folded into the add
; (v_sub with a negated source) when signed zeros may be ignored.
20define amdgpu_kernel void @v_fneg_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
21  %tid = call i32 @llvm.amdgcn.workitem.id.x()
22  %tid.ext = sext i32 %tid to i64
23  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
24  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
25  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
26  %a = load volatile float, ptr addrspace(1) %a.gep
27  %b = load volatile float, ptr addrspace(1) %b.gep
28  %add = fadd float %a, %b
29  %fneg = fneg float %add
30  store float %fneg, ptr addrspace(1) %out.gep
31  ret void
32}
33
34; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
35; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
36; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
37; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
38; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
39; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
40; GCN-NEXT: s_waitcnt vmcnt(0)
41; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
42; GCN-NEXT: s_waitcnt vmcnt(0)
; -(a + b) where the un-negated add result is also stored: the extra use means
; both the add and a separate xor of the sign bit must be emitted.
43define amdgpu_kernel void @v_fneg_add_store_use_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
44  %tid = call i32 @llvm.amdgcn.workitem.id.x()
45  %tid.ext = sext i32 %tid to i64
46  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
47  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
48  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
49  %a = load volatile float, ptr addrspace(1) %a.gep
50  %b = load volatile float, ptr addrspace(1) %b.gep
51  %add = fadd float %a, %b
52  %fneg = fneg float %add
53  store volatile float %fneg, ptr addrspace(1) %out
54  store volatile float %add, ptr addrspace(1) %out
55  ret void
56}
57
58; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
59; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
60; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
61
62; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
63; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
64; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
65
66; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
67; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
68
69; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
70; GCN-NEXT: s_waitcnt vmcnt(0)
71; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
72; GCN-NEXT: s_waitcnt vmcnt(0)
; -(a + b) where the add also feeds (a + b) * 4.0: under nsz the whole
; expression can be rewritten as a negated sub plus a mul by -4.0.
73define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
74  %tid = call i32 @llvm.amdgcn.workitem.id.x()
75  %tid.ext = sext i32 %tid to i64
76  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
77  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
78  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
79  %a = load volatile float, ptr addrspace(1) %a.gep
80  %b = load volatile float, ptr addrspace(1) %b.gep
81  %add = fadd float %a, %b
82  %fneg = fneg float %add
83  %use1 = fmul float %add, 4.0
84  store volatile float %fneg, ptr addrspace(1) %out
85  store volatile float %use1, ptr addrspace(1) %out
86  ret void
87}
88
89; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
90; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
91; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
92
93; GCN-SAFE: v_sub_f32_e32
94; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,
95
96; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
97
98; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; -(-a + b): with nsz the two negations cancel into a plain a - b subtract.
99define amdgpu_kernel void @v_fneg_add_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
100  %tid = call i32 @llvm.amdgcn.workitem.id.x()
101  %tid.ext = sext i32 %tid to i64
102  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
103  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
104  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
105  %a = load volatile float, ptr addrspace(1) %a.gep
106  %b = load volatile float, ptr addrspace(1) %b.gep
107  %fneg.a = fneg float %a
108  %add = fadd float %fneg.a, %b
109  %fneg = fneg float %add
110  store volatile float %fneg, ptr addrspace(1) %out
111  ret void
112}
113
114; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
115; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
116; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
117
118; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
119; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
120
121; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
122; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; -(a + -b): with nsz this folds to b - a (operands of the subtract swapped
; relative to the safe lowering).
123define amdgpu_kernel void @v_fneg_add_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
124  %tid = call i32 @llvm.amdgcn.workitem.id.x()
125  %tid.ext = sext i32 %tid to i64
126  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
127  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
128  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
129  %a = load volatile float, ptr addrspace(1) %a.gep
130  %b = load volatile float, ptr addrspace(1) %b.gep
131  %fneg.b = fneg float %b
132  %add = fadd float %a, %fneg.b
133  %fneg = fneg float %add
134  store volatile float %fneg, ptr addrspace(1) %out
135  ret void
136}
137
138; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
139; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
140; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
141
142; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
143; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
144
145; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
146; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; -(-a + -b): with nsz all three negations cancel into a plain a + b add.
147define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
148  %tid = call i32 @llvm.amdgcn.workitem.id.x()
149  %tid.ext = sext i32 %tid to i64
150  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
151  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
152  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
153  %a = load volatile float, ptr addrspace(1) %a.gep
154  %b = load volatile float, ptr addrspace(1) %b.gep
155  %fneg.a = fneg float %a
156  %fneg.b = fneg float %b
157  %add = fadd float %fneg.a, %fneg.b
158  %fneg = fneg float %add
159  store volatile float %fneg, ptr addrspace(1) %out
160  ret void
161}
162
163; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
164; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
165; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
166
167; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
168; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
169; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
170
171; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
172; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
173; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
174; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
175; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
176; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; -(-a + b) where -a is also stored: the negated input keeps an extra use, so
; a sign-bit xor for -a must coexist with the combined subtract.
177define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
178  %tid = call i32 @llvm.amdgcn.workitem.id.x()
179  %tid.ext = sext i32 %tid to i64
180  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
181  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
182  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
183  %a = load volatile float, ptr addrspace(1) %a.gep
184  %b = load volatile float, ptr addrspace(1) %b.gep
185  %fneg.a = fneg float %a
186  %add = fadd float %fneg.a, %b
187  %fneg = fneg float %add
188  store volatile float %fneg, ptr addrspace(1) %out
189  store volatile float %fneg.a, ptr addrspace(1) %out
190  ret void
191}
192
193; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
194; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
195; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
196
197; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
198; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
199; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
200
201; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
202; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
203; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
204; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
205; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
206; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; -(-a + b) where -a also feeds -a * c: the second use of the negated input is
; folded as a source modifier on the mul rather than a separate xor.
207define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
208  %tid = call i32 @llvm.amdgcn.workitem.id.x()
209  %tid.ext = sext i32 %tid to i64
210  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
211  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
212  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
213  %a = load volatile float, ptr addrspace(1) %a.gep
214  %b = load volatile float, ptr addrspace(1) %b.gep
215  %fneg.a = fneg float %a
216  %add = fadd float %fneg.a, %b
217  %fneg = fneg float %add
218  %use1 = fmul float %fneg.a, %c
219  store volatile float %fneg, ptr addrspace(1) %out
220  store volatile float %use1, ptr addrspace(1) %out
221  ret void
222}
223
224; This one asserted with -enable-no-signed-zeros-fp-math
225; GCN-LABEL: {{^}}fneg_fadd_0:
226; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
227; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
228; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
229
230; GCN-NSZ-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0, v
231; GCN-NSZ: v_cmp_ngt_f32
232; GCN-NSZ: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
; Regression test: select between %tmp2 and -(x + 0.0) then a NaN-producing
; select; this pattern asserted when compiled with -enable-no-signed-zeros-fp-math
; (see the comment above the label). Keep the exact fdiv/fmul/fadd chain intact.
233define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
234.entry:
235  %tmp7 = fdiv float 1.000000e+00, %tmp6
236  %tmp8 = fmul float 0.000000e+00, %tmp7
237  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
238  %.i188 = fadd float %tmp9, 0.000000e+00
239  %tmp10 = fcmp uge float %.i188, %tmp2
240  %tmp11 = fneg float %.i188
241  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
242  %tmp12 = fcmp ule float %.i092, 0.000000e+00
243  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
244  ret float %.i198
245}
246
247; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
248; function attribute unsafe-fp-math automatically. Combine with the previous test
249; when that is done.
250; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
251; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
252; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
253; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fc00000
254; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], 0, [[A]]
255; GCN-NSZ-DAG: v_cmp_ngt_f32_e32 {{.*}}, s{{[0-9]+}}, [[D]]
256; GCN-NSZ-DAG: v_cndmask_b32_e64 [[E:v[0-9]+]], -[[D]], v{{[0-9]+}},
257; GCN-NSZ-DAG: v_cmp_nlt_f32_e32 {{.*}}, 0
258; GCN-NSZ-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, [[C]], 0,
; Same pattern as @fneg_fadd_0 but with an afn fdiv and attribute set #2,
; working around -enable-no-signed-zeros-fp-math not implying unsafe-fp-math
; (per the comment above the label).
259define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
260.entry:
261  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
262  %tmp8 = fmul float 0.000000e+00, %tmp7
263  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
264  %.i188 = fadd float %tmp9, 0.000000e+00
265  %tmp10 = fcmp uge float %.i188, %tmp2
266  %tmp11 = fneg float %.i188
267  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
268  %tmp12 = fcmp ule float %.i092, 0.000000e+00
269  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
270  ret float %.i198
271}
272
273; --------------------------------------------------------------------------------
274; fmul tests
275; --------------------------------------------------------------------------------
276
277; GCN-LABEL: {{^}}v_fneg_mul_f32:
278; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
279; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
280; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
281; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -(a * b): the fneg is always foldable into the mul as a source modifier
; (no nsz needed, since -(a*b) == a * -b exactly).
282define amdgpu_kernel void @v_fneg_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
283  %tid = call i32 @llvm.amdgcn.workitem.id.x()
284  %tid.ext = sext i32 %tid to i64
285  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
286  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
287  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
288  %a = load volatile float, ptr addrspace(1) %a.gep
289  %b = load volatile float, ptr addrspace(1) %b.gep
290  %mul = fmul float %a, %b
291  %fneg = fneg float %mul
292  store float %fneg, ptr addrspace(1) %out.gep
293  ret void
294}
295
296; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
297; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
298; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
299; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
300; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
301; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
302; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; -(a * b) where the un-negated mul is also stored: extra use forces a
; separate sign-bit xor alongside the mul.
303define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
304  %tid = call i32 @llvm.amdgcn.workitem.id.x()
305  %tid.ext = sext i32 %tid to i64
306  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
307  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
308  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
309  %a = load volatile float, ptr addrspace(1) %a.gep
310  %b = load volatile float, ptr addrspace(1) %b.gep
311  %mul = fmul float %a, %b
312  %fneg = fneg float %mul
313  store volatile float %fneg, ptr addrspace(1) %out
314  store volatile float %mul, ptr addrspace(1) %out
315  ret void
316}
317
318; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
319; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
320; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
321; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
322; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
323
324; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
325; GCN-NEXT: s_waitcnt vmcnt(0)
326; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
327; GCN-NEXT: s_waitcnt vmcnt(0)
; -(a * b) where the mul also feeds (a * b) * 4.0: both uses are expressed via
; negated operands (a * -b, then * -4.0) instead of an explicit xor.
328define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
329  %tid = call i32 @llvm.amdgcn.workitem.id.x()
330  %tid.ext = sext i32 %tid to i64
331  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
332  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
333  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
334  %a = load volatile float, ptr addrspace(1) %a.gep
335  %b = load volatile float, ptr addrspace(1) %b.gep
336  %mul = fmul float %a, %b
337  %fneg = fneg float %mul
338  %use1 = fmul float %mul, 4.0
339  store volatile float %fneg, ptr addrspace(1) %out
340  store volatile float %use1, ptr addrspace(1) %out
341  ret void
342}
343
344; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
345; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
346; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
347; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
348; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; -(-a * b): the two negations cancel exactly, leaving a plain a * b mul.
349define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
350  %tid = call i32 @llvm.amdgcn.workitem.id.x()
351  %tid.ext = sext i32 %tid to i64
352  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
353  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
354  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
355  %a = load volatile float, ptr addrspace(1) %a.gep
356  %b = load volatile float, ptr addrspace(1) %b.gep
357  %fneg.a = fneg float %a
358  %mul = fmul float %fneg.a, %b
359  %fneg = fneg float %mul
360  store volatile float %fneg, ptr addrspace(1) %out
361  ret void
362}
363
364; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
365; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
366; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
367; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
368; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; -(a * -b): the two negations cancel exactly, leaving a plain a * b mul.
369define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
370  %tid = call i32 @llvm.amdgcn.workitem.id.x()
371  %tid.ext = sext i32 %tid to i64
372  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
373  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
374  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
375  %a = load volatile float, ptr addrspace(1) %a.gep
376  %b = load volatile float, ptr addrspace(1) %b.gep
377  %fneg.b = fneg float %b
378  %mul = fmul float %a, %fneg.b
379  %fneg = fneg float %mul
380  store volatile float %fneg, ptr addrspace(1) %out
381  ret void
382}
383
384; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
385; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
386; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
387; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
388; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; -(-a * -b): one net negation survives, emitted as a * -b via a source
; modifier on the mul.
389define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
390  %tid = call i32 @llvm.amdgcn.workitem.id.x()
391  %tid.ext = sext i32 %tid to i64
392  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
393  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
394  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
395  %a = load volatile float, ptr addrspace(1) %a.gep
396  %b = load volatile float, ptr addrspace(1) %b.gep
397  %fneg.a = fneg float %a
398  %fneg.b = fneg float %b
399  %mul = fmul float %fneg.a, %fneg.b
400  %fneg = fneg float %mul
401  store volatile float %fneg, ptr addrspace(1) %out
402  ret void
403}
404
405; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
406; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
407; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
408; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
409; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
410
411; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
412; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; -(-a * b) where -a is also stored: the negations cancel in the mul, but the
; stored -a still needs its own sign-bit xor.
413define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
414  %tid = call i32 @llvm.amdgcn.workitem.id.x()
415  %tid.ext = sext i32 %tid to i64
416  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
417  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
418  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
419  %a = load volatile float, ptr addrspace(1) %a.gep
420  %b = load volatile float, ptr addrspace(1) %b.gep
421  %fneg.a = fneg float %a
422  %mul = fmul float %fneg.a, %b
423  %fneg = fneg float %mul
424  store volatile float %fneg, ptr addrspace(1) %out
425  store volatile float %fneg.a, ptr addrspace(1) %out
426  ret void
427}
428
429; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
430; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
431; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
432; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
433; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
434; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
435; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; -(-a * b) where -a also feeds -a * c: the second use of the negation is
; folded as a source modifier on the other mul.
436define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
437  %tid = call i32 @llvm.amdgcn.workitem.id.x()
438  %tid.ext = sext i32 %tid to i64
439  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
440  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
441  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
442  %a = load volatile float, ptr addrspace(1) %a.gep
443  %b = load volatile float, ptr addrspace(1) %b.gep
444  %fneg.a = fneg float %a
445  %mul = fmul float %fneg.a, %b
446  %fneg = fneg float %mul
447  %use1 = fmul float %fneg.a, %c
448  store volatile float %fneg, ptr addrspace(1) %out
449  store volatile float %use1, ptr addrspace(1) %out
450  ret void
451}
452
453; --------------------------------------------------------------------------------
454; fminnum tests
455; --------------------------------------------------------------------------------
456
457; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
458; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
459; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
460; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
461; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
462; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
463; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(a, b) in an IEEE-mode kernel: expects canonicalizing -1.0 multiplies on
; both inputs followed by a max (min/max identity under negation).
464define amdgpu_kernel void @v_fneg_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
465  %tid = call i32 @llvm.amdgcn.workitem.id.x()
466  %tid.ext = sext i32 %tid to i64
467  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
468  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
469  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
470  %a = load volatile float, ptr addrspace(1) %a.gep
471  %b = load volatile float, ptr addrspace(1) %b.gep
472  %min = call float @llvm.minnum.f32(float %a, float %b)
473  %fneg = fneg float %min
474  store float %fneg, ptr addrspace(1) %out.gep
475  ret void
476}
477
478; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
479; GCN-NOT: v0
480; GCN-NOT: v1
481; GCN: v_max_f32_e64 v0, -v0, -v1
482; GCN-NEXT: ; return
; -min(a, b) in a pixel shader (non-IEEE mode): folds straight to
; max(-a, -b) using source modifiers, with no quieting multiplies.
483define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
484  %min = call float @llvm.minnum.f32(float %a, float %b)
485  %fneg = fneg float %min
486  ret float %fneg
487}
488
489; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
490; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
491; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
492; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
493; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(a, a) in IEEE mode: the same negated+quieted input is used for both
; max operands.
494define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
495  %tid = call i32 @llvm.amdgcn.workitem.id.x()
496  %tid.ext = sext i32 %tid to i64
497  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
498  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
499  %a = load volatile float, ptr addrspace(1) %a.gep
500  %min = call float @llvm.minnum.f32(float %a, float %a)
501  %min.fneg = fneg float %min
502  store float %min.fneg, ptr addrspace(1) %out.gep
503  ret void
504}
505
506; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
507; GCN-NOT: v0
508; GCN: v_max_f32_e64 v0, -v0, -v0
509; GCN-NEXT: ; return
; -min(a, a) in a pixel shader: folds to max(-a, -a) via source modifiers.
510define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
511  %min = call float @llvm.minnum.f32(float %a, float %a)
512  %min.fneg = fneg float %min
513  ret float %min.fneg
514}
515
516; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
517; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
518; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
519; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
520; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(4.0, a) in IEEE mode: constant is negated at compile time, so the
; result is max(-4.0, -a) with only the variable input quieted.
521define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
522  %tid = call i32 @llvm.amdgcn.workitem.id.x()
523  %tid.ext = sext i32 %tid to i64
524  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
525  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
526  %a = load volatile float, ptr addrspace(1) %a.gep
527  %min = call float @llvm.minnum.f32(float 4.0, float %a)
528  %fneg = fneg float %min
529  store float %fneg, ptr addrspace(1) %out.gep
530  ret void
531}
532
533; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
534; GCN-NOT: v0
535; GCN: v_max_f32_e64 v0, -v0, -4.0
536; GCN-NEXT: ; return
; -min(4.0, a) in a pixel shader: folds to max(-a, -4.0) with an inline
; constant operand.
537define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
538  %min = call float @llvm.minnum.f32(float 4.0, float %a)
539  %fneg = fneg float %min
540  ret float %fneg
541}
542
543; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
544; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
545; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
546; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
547; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(-4.0, a) in IEEE mode: the negated constant becomes +4.0 in the
; resulting max.
548define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
549  %tid = call i32 @llvm.amdgcn.workitem.id.x()
550  %tid.ext = sext i32 %tid to i64
551  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
552  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
553  %a = load volatile float, ptr addrspace(1) %a.gep
554  %min = call float @llvm.minnum.f32(float -4.0, float %a)
555  %fneg = fneg float %min
556  store float %fneg, ptr addrspace(1) %out.gep
557  ret void
558}
559
560; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
561; GCN-NOT: v0
562; GCN: v_max_f32_e64 v0, -v0, 4.0
563; GCN-NEXT: ; return
; -min(-4.0, a) in a pixel shader: folds to max(-a, 4.0).
564define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
565  %min = call float @llvm.minnum.f32(float -4.0, float %a)
566  %fneg = fneg float %min
567  ret float %fneg
568}
569
570; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
571; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
572; GCN-NOT: [[A]]
573; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
574; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]
575; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(+0.0, a) with the nnan flag on the call: the negate is NOT folded into
; the min here; a min against 0 followed by a sign-bit xor is expected.
576define amdgpu_kernel void @v_fneg_0_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
577  %tid = call i32 @llvm.amdgcn.workitem.id.x()
578  %tid.ext = sext i32 %tid to i64
579  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
580  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
581  %a = load volatile float, ptr addrspace(1) %a.gep
582  %min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
583  %fneg = fneg float %min
584  store float %fneg, ptr addrspace(1) %out.gep
585  ret void
586}
587
588; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
589; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
590; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
591; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
592; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(-0.0, a) in IEEE mode: negating -0.0 gives the +0 inline constant in
; the resulting max against the quieted, negated input.
593define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
594  %tid = call i32 @llvm.amdgcn.workitem.id.x()
595  %tid.ext = sext i32 %tid to i64
596  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
597  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
598  %a = load volatile float, ptr addrspace(1) %a.gep
599  %min = call float @llvm.minnum.f32(float -0.0, float %a)
600  %fneg = fneg float %min
601  store float %fneg, ptr addrspace(1) %out.gep
602  ret void
603}
604
605; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
606; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
607
608; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
609; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
610
611; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
612; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
613; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
614
615; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(1/(2*pi), a): 0x3FC45F3060000000 is the f32 1/(2*pi). Subtargets differ
; on whether the negated constant stays an inline literal, so SI and VI have
; distinct expected sequences (see checks above).
616define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
617  %tid = call i32 @llvm.amdgcn.workitem.id.x()
618  %tid.ext = sext i32 %tid to i64
619  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
620  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
621  %a = load volatile float, ptr addrspace(1) %a.gep
622  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
623  %fneg = fneg float %min
624  store float %fneg, ptr addrspace(1) %out.gep
625  ret void
626}
627
628; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
629; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
630
631; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
632; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]
633
634; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
635; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
636
637; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; -min(-1/(2*pi), a): after negation the constant is positive 1/(2*pi), an
; inline constant on VI but a literal on SI (see checks above).
638define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
639  %tid = call i32 @llvm.amdgcn.workitem.id.x()
640  %tid.ext = sext i32 %tid to i64
641  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
642  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
643  %a = load volatile float, ptr addrspace(1) %a.gep
644  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
645  %fneg = fneg float %min
646  store float %fneg, ptr addrspace(1) %out.gep
647  ret void
648}
649
650; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
651; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
652
653; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
654; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
655; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
656
657; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
658; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
659; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]
660
661; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
662define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
663  %tid = call i32 @llvm.amdgcn.workitem.id.x()
664  %tid.ext = sext i32 %tid to i64
665  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
666  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
667  %a = load volatile half, ptr addrspace(1) %a.gep
668  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
669  %fneg = fsub half -0.000000e+00, %min
670  store half %fneg, ptr addrspace(1) %out.gep
671  ret void
672}
673
674; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
675; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
676
677; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
678; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
679; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
680
681; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
682; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
683
684; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
685define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
686  %tid = call i32 @llvm.amdgcn.workitem.id.x()
687  %tid.ext = sext i32 %tid to i64
688  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
689  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
690  %a = load volatile half, ptr addrspace(1) %a.gep
691  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
692  %fneg = fsub half -0.000000e+00, %min
693  store half %fneg, ptr addrspace(1) %out.gep
694  ret void
695}
696
697; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
698; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
699
700; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
701; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
702; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
703; SI: v_max_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]
704
705; VI: v_min_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[A]], 0.15915494
706; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]
707
708; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
709define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
710  %tid = call i32 @llvm.amdgcn.workitem.id.x()
711  %tid.ext = sext i32 %tid to i64
712  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
713  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
714  %a = load volatile double, ptr addrspace(1) %a.gep
715  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
716  %fneg = fsub double -0.000000e+00, %min
717  store double %fneg, ptr addrspace(1) %out.gep
718  ret void
719}
720
721; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
722; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
723
724; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
725; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
726; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
727; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]
728
729; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
730; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494
731
732; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
733define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
734  %tid = call i32 @llvm.amdgcn.workitem.id.x()
735  %tid.ext = sext i32 %tid to i64
736  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
737  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
738  %a = load volatile double, ptr addrspace(1) %a.gep
739  %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
740  %fneg = fsub double -0.000000e+00, %min
741  store double %fneg, ptr addrspace(1) %out.gep
742  ret void
743}
744
; fneg(minnum(-0.0, a)) in an amdgpu_ps function (no IEEE quieting of the
; input is checked here); folds to max(-a, 0).
; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; fneg(minnum(0.0, a)) where the fneg has a foldable user (fmul); the
; checks expect the fneg to be absorbed as a source modifier on the mul.
; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, ptr addrspace(1) %out.gep
  ret void
}

; fneg(minnum(1/(2*pi), a)) with a foldable fmul user. SI folds the fneg
; into the min itself; VI instead folds it into the mul's source modifier.
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]

; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, ptr addrspace(1) %out.gep
  ret void
}

; Same foldable-use pattern as above, but in an amdgpu_ps function, so no
; input quieting is checked before the min.
; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  ret float %mul
}

; minnum with both a negated use and a non-negated fmul use; the checks
; expect the fold to the negated form, with the other use rewritten to
; multiply by -4.0 to compensate.
; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; Same multi-use pattern in an amdgpu_ps function; results returned in
; v0/v1 instead of stored.
; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}
860
861; --------------------------------------------------------------------------------
862; fmaxnum tests
863; --------------------------------------------------------------------------------
864
865
866; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
867; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
868; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
869; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
870; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
871; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
872; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
873define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
874  %tid = call i32 @llvm.amdgcn.workitem.id.x()
875  %tid.ext = sext i32 %tid to i64
876  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
877  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
878  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
879  %a = load volatile float, ptr addrspace(1) %a.gep
880  %b = load volatile float, ptr addrspace(1) %b.gep
881  %max = call float @llvm.maxnum.f32(float %a, float %b)
882  %fneg = fneg float %max
883  store float %fneg, ptr addrspace(1) %out.gep
884  ret void
885}
886
887; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
888; GCN-NOT: v0
889; GCN-NOT: v1
890; GCN: v_min_f32_e64 v0, -v0, -v1
891; GCN-NEXT: ; return
892define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
893  %max = call float @llvm.maxnum.f32(float %a, float %b)
894  %fneg = fneg float %max
895  ret float %fneg
896}
897
898; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
899; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
900; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
901; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
902; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
903define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
904  %tid = call i32 @llvm.amdgcn.workitem.id.x()
905  %tid.ext = sext i32 %tid to i64
906  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
907  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
908  %a = load volatile float, ptr addrspace(1) %a.gep
909  %max = call float @llvm.maxnum.f32(float %a, float %a)
910  %max.fneg = fneg float %max
911  store float %max.fneg, ptr addrspace(1) %out.gep
912  ret void
913}
914
915; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
916; GCN-NOT: v0
917; GCN: v_min_f32_e64 v0, -v0, -v0
918; GCN-NEXT: ; return
919define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
920  %max = call float @llvm.maxnum.f32(float %a, float %a)
921  %max.fneg = fneg float %max
922  ret float %max.fneg
923}
924
925; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
926; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
927; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
928; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
929; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
930define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
931  %tid = call i32 @llvm.amdgcn.workitem.id.x()
932  %tid.ext = sext i32 %tid to i64
933  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
934  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
935  %a = load volatile float, ptr addrspace(1) %a.gep
936  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
937  %fneg = fneg float %max
938  store float %fneg, ptr addrspace(1) %out.gep
939  ret void
940}
941
942; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
943; GCN-NOT: v0
944; GCN: v_min_f32_e64 v0, -v0, -4.0
945; GCN-NEXT: ; return
946define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
947  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
948  %fneg = fneg float %max
949  ret float %fneg
950}
951
952; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
953; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
954; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
955; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
956; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
957define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
958  %tid = call i32 @llvm.amdgcn.workitem.id.x()
959  %tid.ext = sext i32 %tid to i64
960  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
961  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
962  %a = load volatile float, ptr addrspace(1) %a.gep
963  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
964  %fneg = fneg float %max
965  store float %fneg, ptr addrspace(1) %out.gep
966  ret void
967}
968
969; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
970; GCN-NOT: v0
971; GCN: v_min_f32_e64 v0, -v0, 4.0
972; GCN-NEXT: ; return
973define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
974  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
975  %fneg = fneg float %max
976  ret float %fneg
977}
978
979; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
980; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
981; GCN-NOT: [[A]]
982; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
983; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
984; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
985define amdgpu_kernel void @v_fneg_0_maxnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
986  %tid = call i32 @llvm.amdgcn.workitem.id.x()
987  %tid.ext = sext i32 %tid to i64
988  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
989  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
990  %a = load volatile float, ptr addrspace(1) %a.gep
991  %max = call nnan float @llvm.maxnum.f32(float 0.0, float %a)
992  %fneg = fneg float %max
993  store float %fneg, ptr addrspace(1) %out.gep
994  ret void
995}
996
997; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
998; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
999; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
1000; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
1001; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1002define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
1003  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1004  %tid.ext = sext i32 %tid to i64
1005  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1006  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1007  %a = load volatile float, ptr addrspace(1) %a.gep
1008  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
1009  %fneg = fneg float %max
1010  store float %fneg, ptr addrspace(1) %out.gep
1011  ret void
1012}
1013
1014; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
1015; GCN-NOT: v0
1016; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
1017; GCN-NEXT: ; return
1018define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
1019  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
1020  %fneg = fneg float %max
1021  ret float %fneg
1022}
1023
1024; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
1025; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1026; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1027; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
1028; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
1029; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
1030; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1031define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1032  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1033  %tid.ext = sext i32 %tid to i64
1034  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1035  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1036  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1037  %a = load volatile float, ptr addrspace(1) %a.gep
1038  %b = load volatile float, ptr addrspace(1) %b.gep
1039  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
1040  %fneg = fneg float %max
1041  %mul = fmul float %fneg, %b
1042  store float %mul, ptr addrspace(1) %out.gep
1043  ret void
1044}
1045
1046; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
1047; GCN-NOT: v0
1048; GCN-NOT: v1
1049; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
1050; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
1051; GCN-NEXT: ; return
1052define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
1053  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
1054  %fneg = fneg float %max
1055  %mul = fmul float %fneg, %b
1056  ret float %mul
1057}
1058
1059; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
1060; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1061; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1062; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
1063; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
1064; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
1065; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
1066; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
1067; GCN-NEXT: s_waitcnt vmcnt(0)
1068; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
1069; GCN-NEXT: s_waitcnt vmcnt(0)
1070define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1071  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1072  %tid.ext = sext i32 %tid to i64
1073  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1074  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1075  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1076  %a = load volatile float, ptr addrspace(1) %a.gep
1077  %b = load volatile float, ptr addrspace(1) %b.gep
1078  %max = call float @llvm.maxnum.f32(float %a, float %b)
1079  %fneg = fneg float %max
1080  %use1 = fmul float %max, 4.0
1081  store volatile float %fneg, ptr addrspace(1) %out
1082  store volatile float %use1, ptr addrspace(1) %out
1083  ret void
1084}
1085
1086; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
1087; GCN-NOT: v0
1088; GCN-NOT: v1
1089; GCN: v_min_f32_e64 v0, -v0, -v1
1090; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
1091; GCN-NEXT: ; return
1092define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
1093  %max = call float @llvm.maxnum.f32(float %a, float %b)
1094  %fneg = fneg float %max
1095  %use1 = fmul float %max, 4.0
1096  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
1097  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
1098  ret <2 x float> %ins1
1099}
1100
1101; --------------------------------------------------------------------------------
1102; fma tests
1103; --------------------------------------------------------------------------------
1104
1105; GCN-LABEL: {{^}}v_fneg_fma_f32:
1106; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1107; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1108; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1109
1110; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
1111; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]
1112
1113; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
1114; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1115define amdgpu_kernel void @v_fneg_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
1116  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1117  %tid.ext = sext i32 %tid to i64
1118  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1119  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1120  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
1121  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1122  %a = load volatile float, ptr addrspace(1) %a.gep
1123  %b = load volatile float, ptr addrspace(1) %b.gep
1124  %c = load volatile float, ptr addrspace(1) %c.gep
1125  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
1126  %fneg = fneg float %fma
1127  store float %fneg, ptr addrspace(1) %out.gep
1128  ret void
1129}
1130
1131; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
1132; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1133; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1134; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1135; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1136; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
1137; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
1138; GCN-NEXT: s_waitcnt vmcnt(0)
1139; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
1140; GCN-NEXT: s_waitcnt vmcnt(0)
1141define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
1142  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1143  %tid.ext = sext i32 %tid to i64
1144  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1145  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1146  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
1147  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1148  %a = load volatile float, ptr addrspace(1) %a.gep
1149  %b = load volatile float, ptr addrspace(1) %b.gep
1150  %c = load volatile float, ptr addrspace(1) %c.gep
1151  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
1152  %fneg = fneg float %fma
1153  store volatile float %fneg, ptr addrspace(1) %out
1154  store volatile float %fma, ptr addrspace(1) %out
1155  ret void
1156}
1157
1158; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
1159; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1160; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1161; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1162
1163; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1164; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
1165; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
1166
1167; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
1168; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
1169
1170; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
1171; GCN-NEXT: s_waitcnt vmcnt(0)
1172; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
1173; GCN-NEXT: s_waitcnt vmcnt(0)
1174define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
1175  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1176  %tid.ext = sext i32 %tid to i64
1177  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1178  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1179  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
1180  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1181  %a = load volatile float, ptr addrspace(1) %a.gep
1182  %b = load volatile float, ptr addrspace(1) %b.gep
1183  %c = load volatile float, ptr addrspace(1) %c.gep
1184  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
1185  %fneg = fneg float %fma
1186  %use1 = fmul float %fma, 4.0
1187  store volatile float %fneg, ptr addrspace(1) %out
1188  store volatile float %use1, ptr addrspace(1) %out
1189  ret void
1190}
1191
1192; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
1193; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1194; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1195; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1196
1197; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
1198; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1199
1200; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
1201; GCN-NSZ-NOT: [[FMA]]
1202; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
1203define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
1204  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1205  %tid.ext = sext i32 %tid to i64
1206  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1207  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1208  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
1209  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1210  %a = load volatile float, ptr addrspace(1) %a.gep
1211  %b = load volatile float, ptr addrspace(1) %b.gep
1212  %c = load volatile float, ptr addrspace(1) %c.gep
1213  %fneg.a = fneg float %a
1214  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
1215  %fneg = fneg float %fma
1216  store volatile float %fneg, ptr addrspace(1) %out
1217  ret void
1218}
1219
1220; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
1221; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1222; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1223; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1224
1225; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
1226; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1227
1228; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
1229; GCN-NSZ-NOT: [[FMA]]
1230; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
1231define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
1232  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1233  %tid.ext = sext i32 %tid to i64
1234  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1235  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
1236  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1237  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1238  %a = load volatile float, ptr addrspace(1) %a.gep
1239  %b = load volatile float, ptr addrspace(1) %b.gep
1240  %c = load volatile float, ptr addrspace(1) %c.gep
1241  %fneg.b = fneg float %b
1242  %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
1243  %fneg = fneg float %fma
1244  store volatile float %fneg, ptr addrspace(1) %out
1245  ret void
1246}
1247
1248; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
1249; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1250; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1251; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1252
1253; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1254; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1255
1256; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
1257; GCN-NSZ-NOT: [[FMA]]
1258; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
1259define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
1260  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1261  %tid.ext = sext i32 %tid to i64
1262  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1263  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1264  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
1265  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1266  %a = load volatile float, ptr addrspace(1) %a.gep
1267  %b = load volatile float, ptr addrspace(1) %b.gep
1268  %c = load volatile float, ptr addrspace(1) %c.gep
1269  %fneg.a = fneg float %a
1270  %fneg.b = fneg float %b
1271  %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
1272  %fneg = fneg float %fma
1273  store volatile float %fneg, ptr addrspace(1) %out
1274  ret void
1275}
1276
1277; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
1278; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1279; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1280; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1281
1282; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
1283; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1284
1285; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1286; GCN-NSZ-NOT: [[FMA]]
1287; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
1288define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
1289  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1290  %tid.ext = sext i32 %tid to i64
1291  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
1292  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
1293  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
1294  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1295  %a = load volatile float, ptr addrspace(1) %a.gep
1296  %b = load volatile float, ptr addrspace(1) %b.gep
1297  %c = load volatile float, ptr addrspace(1) %c.gep
1298  %fneg.a = fneg float %a
1299  %fneg.c = fneg float %c
1300  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
1301  %fneg = fneg float %fma
1302  store volatile float %fneg, ptr addrspace(1) %out
1303  ret void
1304}
1305
; fneg(fma(a, b, -c)). In NSZ mode the outer fneg distributes, cancelling
; the addend's negation and landing on one multiplicand instead.
; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; Note(review): these safe checks previously used the prefix GCN-NSZ-SAFE,
; which no RUN line enables, so the safe path was never actually verified.
; Renamed to GCN-SAFE to match the sibling fma tests (fma + sign-bit xor).
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}
1333
; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_xor_b32
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
; GCN-SAFE: v_xor_b32

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]

; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; fneg(%a) is also stored directly, so it must still be materialized with an
; xor even though the fma itself folds the negations under nsz.
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %fneg.a, ptr addrspace(1) %out
  ret void
}
1368
; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; The second use of fneg(%a) is a foldable fmul, so the negation is absorbed
; as a source modifier on that mul instead of needing a separate xor.
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, float %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fneg.a, %d
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}
1401
1402; --------------------------------------------------------------------------------
1403; fmad tests
1404; --------------------------------------------------------------------------------
1405
; GCN-LABEL: {{^}}v_fneg_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]

; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(fmuladd(a, b, c)): nsz distributes the negation as -b and -c operand
; modifiers on the mad; the safe path uses mac then an explicit xor.
define amdgpu_kernel void @v_fneg_fmad_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
1431
; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:

; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; Vector form of the fmad fold: each of the four lanes becomes a mad with
; negated second and third operands under nsz.
define amdgpu_kernel void @v_fneg_fmad_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile <4 x float>, ptr addrspace(1) %a.gep
  %b = load volatile <4 x float>, ptr addrspace(1) %b.gep
  %c = load volatile <4 x float>, ptr addrspace(1) %c.gep
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  %fneg = fneg <4 x float> %fma
  store <4 x float> %fneg, ptr addrspace(1) %out.gep
  ret void
}
1453
; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]

; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; The fmuladd result has a second use (* 4.0); under nsz the negated mad is
; produced and the second use compensates by multiplying by -4.0 instead.
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}
1487
1488; --------------------------------------------------------------------------------
1489; fp_extend tests
1490; --------------------------------------------------------------------------------
1491
; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(fpext(a)): the negation folds into the convert's source modifier.
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, ptr addrspace(1) %out.gep
  ret void
}
1507
; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(fpext(fneg(a))): the two negations cancel, leaving a plain convert.
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, ptr addrspace(1) %out.gep
  ret void
}
1524
; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]]
; fneg(%a) is stored separately, so it still needs an xor even though the
; negations cancel inside the convert chain.
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, ptr addrspace(1) %out.gep
  store volatile float %fneg.a, ptr addrspace(1) undef
  ret void
}
1544
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[CVT_LO]]:[[CVT_HI]]]
; Both fpext and its fneg are stored: the f64 negation is just an xor of the
; high dword of the converted pair.
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, ptr addrspace(1) %out.gep
  store volatile double %fpext, ptr addrspace(1) undef
  ret void
}
1563
; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v[[[CVT_LO]]:[[CVT_HI]]], 4.0
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; The fpext result feeds both a stored fneg (high-dword xor) and an f64 mul.
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  %mul = fmul double %fpext, 4.0
  store volatile double %fneg, ptr addrspace(1) %out.gep
  store volatile double %mul, ptr addrspace(1) %out.gep
  ret void
}
1584
; FIXME: Source modifiers not folded for f16->f32
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
; No instruction checks yet (see FIXME above); this only pins down that the
; kernel compiles for the f16 source case.
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile half, ptr addrspace(1) %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %fpext, ptr addrspace(1) %out.gep
  ret void
}
1599
; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
; No instruction checks yet (f16->f32 source-modifier folding is not
; implemented; see the FIXME on the previous test).
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile half, ptr addrspace(1) %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  %mul = fmul float %fpext, 4.0
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %mul, ptr addrspace(1) %out.gep
  ret void
}
1614
1615; --------------------------------------------------------------------------------
1616; fp_round tests
1617; --------------------------------------------------------------------------------
1618
; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(fptrunc(a)): the negation folds into the convert's source modifier.
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
1634
; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(fptrunc(fneg(a))): the two negations cancel, leaving a plain convert.
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
1651
; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v[[[A_LO]]:[[A_HI]]]
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[A_LO]]:[[NEG_A_HI]]]
; fneg(%a) is also stored as f64, so its high dword still gets an xor while
; the rounded value uses the unnegated input.
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile double %fneg.a, ptr addrspace(1) undef
  ret void
}
1671
; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s[

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
; The extra use of fneg(%a) is a foldable f64 mul, so the negation becomes a
; source modifier there instead of a separate xor.
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, double %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  %use1 = fmul double %fneg.a, %c
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile double %use1, ptr addrspace(1) undef
  ret void
}
1693
; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(fptrunc f32->f16): the negation folds into the convert's source modifier.
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpround = fptrunc float %a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, ptr addrspace(1) %out.gep
  ret void
}
1709
; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(fptrunc(fneg(a))) f32->f16: the two negations cancel.
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, ptr addrspace(1) %out.gep
  ret void
}
1726
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
; Both the rounded value and its negation are stored, so one convert plus an
; xor of its result is expected.
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %fpround, ptr addrspace(1) %out.gep
  ret void
}
1745
; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; fneg(%a) has a separate f32 store use, so it is materialized with an xor
; while the f16 convert consumes the unnegated input.
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, ptr addrspace(1) %out.gep
  store volatile float %fneg.a, ptr addrspace(1) undef
  ret void
}
1765
; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
; The extra use of fneg(%a) is a foldable mul, so the negation becomes a
; source modifier on that mul rather than a separate xor.
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
  store volatile half %fneg, ptr addrspace(1) %out.gep
  store volatile float %use1, ptr addrspace(1) undef
  ret void
}
1786
1787; --------------------------------------------------------------------------------
1788; rcp tests
1789; --------------------------------------------------------------------------------
1790
; GCN-LABEL: {{^}}v_fneg_rcp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(rcp(a)) == rcp(-a): the negation folds into the rcp source modifier.
define amdgpu_kernel void @v_fneg_rcp_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
  %fneg = fneg float %rcp
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
1806
; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(rcp(fneg(a))): the two negations cancel, leaving a plain rcp.
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
1823
; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; fneg(%a) is also stored, so it still needs an xor even though the rcp chain
; cancels the negations.
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %fneg.a, ptr addrspace(1) undef
  ret void
}
1843
; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; The extra use of fneg(%a) is a foldable mul, so the negation becomes a
; source modifier there instead of a separate xor.
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %use1, ptr addrspace(1) undef
  ret void
}
1864
1865; --------------------------------------------------------------------------------
1866; fmul_legacy tests
1867; --------------------------------------------------------------------------------
1868
; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg(mul_legacy(a, b)): the negation folds into one multiply operand.
define amdgpu_kernel void @v_fneg_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
1887
; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; The multiply result itself is stored too, so the fneg cannot be folded away
; and is emitted as a sign-bit xor of the mul result.
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %mul, ptr addrspace(1) %out
  ret void
}
1911
1912; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
1913; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1914; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], 4.0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
1920; GCN-NEXT: s_waitcnt vmcnt(0)
; The fmul.legacy result feeds both the fneg and a second fmul.legacy, so the
; negation has to be distributed rather than simply deleted.
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  ; Second use of %mul, scaled by 4.0.
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}
1936
1937; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
1938; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1939; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; -((-a) * b): the two negations should cancel, leaving a plain multiply.
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}
1956
1957; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
1958; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1959; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; -(a * (-b)): mirror of the previous test with the inner fneg on the second
; operand; the negations should again cancel.
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}
1976
1977; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
1978; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1979; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; -((-a) * (-b)): three negations total, so one negation must survive in the
; final multiply.
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}
1997
1998; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
1999; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2000; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2001; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
2002; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
2003; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
2004; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; The inner fneg (%fneg.a) has a second user (it is stored), so it must be
; materialized even though it also cancels inside the multiply.
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %fneg.a, ptr addrspace(1) %out
  ret void
}
2020
2021; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
2022; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2023; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2024; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
2025; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
2026; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
2027; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; The inner fneg (%fneg.a) feeds two fmul.legacy calls, one of which is itself
; negated; %c is an SGPR kernel argument, not a loaded value.
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}
2044
2045; --------------------------------------------------------------------------------
2046; sin tests
2047; --------------------------------------------------------------------------------
2048
2049; GCN-LABEL: {{^}}v_fneg_sin_f32:
2050; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2051; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
2052; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
2053; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
2054; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg of the generic llvm.sin intrinsic (expanded via mul/fract/v_sin per the
; CHECK lines above).
define amdgpu_kernel void @v_fneg_sin_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %sin = call float @llvm.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2066
2067; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
2068; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2069; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2070; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg of the target llvm.amdgcn.sin intrinsic; expected to fold into a source
; modifier on v_sin_f32 (sin(-x) = -sin(x)).
define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %sin = call float @llvm.amdgcn.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2082
2083; --------------------------------------------------------------------------------
2084; ftrunc tests
2085; --------------------------------------------------------------------------------
2086
2087; GCN-LABEL: {{^}}v_fneg_trunc_f32:
2088; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2089; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2090; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg of trunc; expected to fold into a negated source modifier on
; v_trunc_f32 (trunc(-x) = -trunc(x)).
define amdgpu_kernel void @v_fneg_trunc_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %trunc = call float @llvm.trunc.f32(float %a)
  %fneg = fneg float %trunc
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2102
2103; --------------------------------------------------------------------------------
2104; fround tests
2105; --------------------------------------------------------------------------------
2106
2107; GCN-LABEL: {{^}}v_fneg_round_f32:
2108; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2109; GCN: v_trunc_f32_e32
2110; GCN: v_sub_f32_e32
2111; GCN: v_cndmask_b32
2112
2113; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
2114; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]
2115
2116; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
2117; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg of llvm.round, which is expanded (trunc/sub/cndmask per the CHECK lines
; above); safe vs. nsz modes differ in where the negation lands.
define amdgpu_kernel void @v_fneg_round_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %round = call float @llvm.round.f32(float %a)
  %fneg = fneg float %round
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2129
2130; --------------------------------------------------------------------------------
2131; rint tests
2132; --------------------------------------------------------------------------------
2133
2134; GCN-LABEL: {{^}}v_fneg_rint_f32:
2135; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2136; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2137; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg of rint; expected to fold into a negated source modifier on
; v_rndne_f32.
define amdgpu_kernel void @v_fneg_rint_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %rint = call float @llvm.rint.f32(float %a)
  %fneg = fneg float %rint
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2149
2150; --------------------------------------------------------------------------------
2151; nearbyint tests
2152; --------------------------------------------------------------------------------
2153
2154; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
2155; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2156; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2157; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg of nearbyint; lowers to the same v_rndne_f32 pattern as rint on this
; target, with the negation folded into the source modifier.
define amdgpu_kernel void @v_fneg_nearbyint_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %nearbyint = call float @llvm.nearbyint.f32(float %a)
  %fneg = fneg float %nearbyint
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2169
2170; --------------------------------------------------------------------------------
2171; fcanonicalize tests
2172; --------------------------------------------------------------------------------
2173
2174; GCN-LABEL: {{^}}v_fneg_canonicalize_f32:
2175; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2176; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]]
2177; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fneg of fcanonicalize, checked to lower to a single multiply by -1.0.
define amdgpu_kernel void @v_fneg_canonicalize_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  ; NOTE(review): %trunc actually holds the canonicalize result; the name is a
  ; copy-paste leftover from the trunc test.
  %trunc = call float @llvm.canonicalize.f32(float %a)
  %fneg = fneg float %trunc
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2189
2190; --------------------------------------------------------------------------------
2191; vintrp tests
2192; --------------------------------------------------------------------------------
2193
2194; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
2195; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2196; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2197; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
2198; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
2199; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; The fneg cannot fold into v_interp_p1, so it should instead fold back into
; the multiply that produces its operand.
define amdgpu_kernel void @v_fneg_interp_p1_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, ptr addrspace(1) %out.gep
  store volatile float %intrp1, ptr addrspace(1) %out.gep
  ret void
}
2216
2217; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
2218; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2219; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2220; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
2221; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
2222; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; Same as the p1 test but through v_interp_p2, with the negated value as the
; second interpolation operand.
define amdgpu_kernel void @v_fneg_interp_p2_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, ptr addrspace(1) %out.gep
  store volatile float %intrp1, ptr addrspace(1) %out.gep
  ret void
}
2239
2240; --------------------------------------------------------------------------------
2241; CopyToReg tests
2242; --------------------------------------------------------------------------------
2243
2244; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
2245; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2246; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2247; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2248; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
2249; GCN: s_cbranch_scc0
2250
2251; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
2252; GCN: s_endpgm
2253
2254; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
2255; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
2256; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2257
; The fneg and the un-negated %mul live in different blocks (fneg used in %if,
; %mul stored in %endif), so the value crosses a CopyToReg; the negation must
; stay an explicit xor in the %if block.
define amdgpu_kernel void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %cmp0 = icmp eq i32 %d, 0
  br i1 %cmp0, label %if, label %endif

if:
  %mul1 = fmul float %fneg, %c
  store volatile float %mul1, ptr addrspace(1) %out.gep
  br label %endif

endif:
  store volatile float %mul, ptr addrspace(1) %out.gep
  ret void
}
2282
2283; --------------------------------------------------------------------------------
2284; inlineasm tests
2285; --------------------------------------------------------------------------------
2286
2287; Can't fold into use, so should fold into source
2288; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
2289; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2290; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2291; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
2292; GCN: ; use [[MUL]]
2293; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; The fneg feeds inline asm (no source modifiers possible there), but it is the
; only user of %mul, so it can fold into the multiply instead.
define amdgpu_kernel void @v_fneg_inlineasm_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  ; %c and %d are loaded/passed only to keep the signature parallel with the
  ; sibling tests; %c's load is volatile so it is still emitted.
  %c = load volatile float, ptr addrspace(1) %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2310
2311; --------------------------------------------------------------------------------
; inlineasm tests (multi-use source)
2313; --------------------------------------------------------------------------------
2314
2315; Can't fold into use, so should fold into source
2316; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
2317; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2318; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2319; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
2320; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
2321; GCN: ; use [[NEG]]
2322; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; %mul has a second (positive) use via the store, so the fneg for the inline
; asm operand must stay a separate xor.
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %mul, ptr addrspace(1) %out.gep
  ret void
}
2339
2340; --------------------------------------------------------------------------------
2341; code size regression tests
2342; --------------------------------------------------------------------------------
2343
2344; There are multiple users of the fneg that must use a VOP3
2345; instruction, so there is no penalty
2346; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
2347; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2348; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2349; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2350
2351; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
2352; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
2353
2354; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
2355; GCN-NEXT: s_waitcnt vmcnt(0)
2356; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
2357; GCN-NEXT: s_waitcnt vmcnt(0)
; %fneg.a has two FMA users; both are VOP3 so each absorbs the negation as a
; free source modifier with no code-size penalty.
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)

  store volatile float %fma0, ptr addrspace(1) %out
  store volatile float %fma1, ptr addrspace(1) %out
  ret void
}
2377
2378; There are multiple users, but both require using a larger encoding
2379; for the modifier.
2380
2381; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
2382; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2383; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2384; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2385
2386; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
2387; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
2388; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
2389; GCN-NEXT: s_waitcnt vmcnt(0)
2390; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2391; GCN-NEXT: s_waitcnt vmcnt(0)
; %fneg.a has two VOP2 multiply users; folding forces both into the larger
; VOP3 (_e64) encoding, but that still beats a separate xor.
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep

  %fneg.a = fneg float %a
  %mul0 = fmul float %fneg.a, %b
  %mul1 = fmul float %fneg.a, %c

  store volatile float %mul0, ptr addrspace(1) %out
  store volatile float %mul1, ptr addrspace(1) %out
  ret void
}
2411
2412; One user is VOP3 so has no cost to folding the modifier, the other does.
2413; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
2414; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2415; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2416; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2417
2418; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
2419; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
2420
2421; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
2422; GCN-NEXT: s_waitcnt vmcnt(0)
2423; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2424; GCN-NEXT: s_waitcnt vmcnt(0)
; Mixed users: the FMA (VOP3) takes the negation for free, the fmul (VOP2)
; pays the _e64 encoding cost.
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
  %mul1 = fmul float %fneg.a, %c

  store volatile float %fma0, ptr addrspace(1) %out
  store volatile float %mul1, ptr addrspace(1) %out
  ret void
}
2444
2445; The use of the fneg requires a code size increase, but folding into
2446; the source does not
2447
2448; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
2449; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2450; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2451; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2452; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
2453
2454; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
2455; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
2456; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
2457
2458; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
2459; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
2460; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
2461
2462; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2463; GCN-NEXT: s_waitcnt vmcnt(0)
2464; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
2465; GCN-NEXT: s_waitcnt vmcnt(0)
; Negating the FMA result is free (source/clamp modifiers on VOP3) while each
; fmul user would need the larger encoding; under nsz the negation moves into
; the FMA's operands (see CHECK lines above).
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %d = load volatile float, ptr addrspace(1) %d.gep

  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
  %fneg.fma0 = fneg float %fma0
  %mul1 = fmul float %fneg.fma0, %c
  %mul2 = fmul float %fneg.fma0, %d

  store volatile float %mul1, ptr addrspace(1) %out
  store volatile float %mul2, ptr addrspace(1) %out
  ret void
}
2488
2489; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
2490; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
2491; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
2492; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
2493; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
2494
2495; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
2496; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
2497; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
2498
2499; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
2500; GCN-NEXT: s_waitcnt vmcnt(0)
2501; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2502; GCN-NEXT: s_waitcnt vmcnt(0)
; f64 variant: v_mul_f64 is always VOP3, so both users fold the negation as a
; source modifier with no size cost.
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds double, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds double, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds double, ptr addrspace(1) %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %b = load volatile double, ptr addrspace(1) %b.gep
  %c = load volatile double, ptr addrspace(1) %c.gep
  %d = load volatile double, ptr addrspace(1) %d.gep

  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  ; Deliberately uses the legacy fsub -0.0 spelling of fneg.
  %fneg.fma0 = fsub double -0.0, %fma0
  %mul1 = fmul double %fneg.fma0, %c
  %mul2 = fmul double %fneg.fma0, %d

  store volatile double %mul1, ptr addrspace(1) %out
  store volatile double %mul2, ptr addrspace(1) %out
  ret void
}
2525
2526; %trunc.a has one fneg use, but it requires a code size increase and
; the fneg can instead be folded for free into the fma.
2528
2529; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
2530; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2531; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2532; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2533; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
2534; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
2535; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; Single fneg use of a trunc result: folding the fneg into the fma source
; modifier is free (CHECK expects v_fma_f32 ..., -[[TRUNC_A]], ...).
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
  ; Per-lane index so each workitem addresses its own element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; %d is otherwise unused, but the volatile load must still be emitted.
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %d = load volatile float, ptr addrspace(1) %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  ; NOTE(review): store targets %out, not %out.gep — confirm intentional.
  store volatile float %fma0, ptr addrspace(1) %out
  ret void
}
2555
2556; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
2557; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2558; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2559; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2560; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
2561; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
2562; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
2563; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
2564; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
2565; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; The trunc result has one negated use (the fma) and one non-negated use
; (the fmul): CHECK expects the fneg folded only into the fma operand
; while the plain v_mul_f32 keeps the un-negated [[TRUNC_A]].
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 {
  ; Per-lane index so each workitem addresses its own element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads pin the four loads the CHECK lines capture as A-D.
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %d = load volatile float, ptr addrspace(1) %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  %mul1 = fmul float %trunc.a, %d
  ; NOTE(review): stores target %out, not %out.gep — confirm intentional.
  store volatile float %fma0, ptr addrspace(1) %out
  store volatile float %mul1, ptr addrspace(1) %out
  ret void
}
2587
2588; The AMDGPU combine to pull fneg into the FMA operands was being
2589; undone by the generic combine to pull the fneg out of the fma if
2590; !isFNegFree. We were reporting false for v2f32 even though it will
2591; be split into f32 where it will be free.
2592; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop:
2593; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}}
2594; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]]
2595; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]]
2596; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0
2597; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1
2598; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4
2599; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5
2600; GCN: s_setpc_b64
; Regression test for the fneg-push/pull DAG-combine loop described in the
; comment above: v2f32 fneg is reported non-free but becomes free once the
; vector is split into scalar f32 ops.
define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
bb:
  ; fma(arg1, arg2, 0) with fast flags, then negate the sum and multiply:
  ; the fneg should end up folded into the fma operands (see CHECK lines).
  %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
  %i4 = fadd fast <2 x float> %i3, %arg
  %i5 = fneg <2 x float> %i4
  %i6 = fmul fast <2 x float> %i5, %arg2
  ret <2 x float> %i6
}
2609
2610; This expects denormal flushing, so can't turn this fmul into fneg
2611; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
2612; GCN: s_waitcnt
2613; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; fmul by -1.0 with a following nnan fmul; attribute set #0 selects
; denormal flushing for f32, so per the comment above the multiply by
; -1.0 cannot simply become an fneg, yet CHECK still expects a single
; v_mul_f32 with a negated source modifier.
define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %mul = fmul float %x, -1.0
  ; nnan on the second fmul rules out an snan reaching the combine.
  %add = fmul nnan float %mul, %y
  ret float %add
}
2619
2620; It's legal to turn this fmul into an fneg since denormals are
2621; preserved and we know an snan can't happen from the flag.
2622; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
2623; GCN: v_mul_f32_e64 v0, -v0, v1
2624; GCN-NEXT: s_setpc_b64
; No #0 attribute here, so the default (denormal-preserving) mode applies;
; combined with nnan on the -1.0 multiply, folding it to an fneg source
; modifier is legal (CHECK: v_mul_f32_e64 v0, -v0, v1).
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  %mul = fmul nnan float %x, -1.0
  %add = fmul float %mul, %y
  ret float %add
}
2630
2631; know the source can't be an snan
2632; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
2633; GCN: s_waitcnt
2634; GCN-NEXT: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
2635; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
2636; GCN-NEXT: s_setpc_b64
; The x*x multiply canonicalizes the value, so the -1.0 multiply cannot
; see an snan even without nnan flags; the fneg folds into the first
; multiply's source modifier (see CHECK lines).
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  ; x*x quiets any snan before the negation.
  %canonical = fmul float %x, %x
  %mul = fmul float %canonical, -1.0
  %add = fmul float %mul, %y
  ret float %add
}
2643
2644; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
2645; GCN: s_waitcnt
2646; GCN-NEXT: v_mul_f32_e32 [[TMP:v[0-9]+]], 1.0, v0
2647; GCN-NEXT: v_mul_f32_e64 v0, -[[TMP]], v1
; Same shape as above but with attribute set #0 (f32 denormal flushing)
; and an explicit canonicalize to quiet a potential snan; CHECK expects
; the canonicalize lowered to v_mul_f32 by 1.0 followed by a negated mul.
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  ; Quiet any snan in %x before the -1.0 multiply is combined away.
  %quiet = call float @llvm.canonicalize.f32(float %x)
  %mul = fmul float %quiet, -1.0
  %add = fmul float %mul, %y
  ret float %add
}
2654
2655; GCN-LABEL: {{^}}fadd_select_fneg_fneg_f32:
2656; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
2657; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
2658; GCN-NEXT: v_sub_f32_e32 v0, v3, v0
2659; GCN-NEXT: s_setpc_b64
; select(c, -x, -y) + z: the fneg should be hoisted out of both select
; arms and folded into the add, giving cndmask + v_sub (z - select(x,y)).
define float @fadd_select_fneg_fneg_f32(i32 %arg0, float %x, float %y, float %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg float %x
  %neg.y  = fneg float %y
  %select = select i1 %cmp, float %neg.x, float %neg.y
  %add = fadd float %select, %z
  ret float %add
}
2668
2669; GCN-LABEL: {{^}}fadd_select_fneg_fneg_f64:
2670; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
2671; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
2672; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
2673; GCN-NEXT: v_add_f64 v[0:1], v[5:6], -v[1:2]
2674; GCN-NEXT: s_setpc_b64
; f64 variant of the select-of-fnegs fold: only the two 32-bit halves are
; selected, then a single v_add_f64 with a negated source consumes them.
define double @fadd_select_fneg_fneg_f64(i32 %arg0, double %x, double %y, double %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg double %x
  %neg.y  = fneg double %y
  %select = select i1 %cmp, double %neg.x, double %neg.y
  %add = fadd double %select, %z
  ret double %add
}
2683
2684; GCN-LABEL: {{^}}fadd_select_fneg_fneg_f16:
2685; SI: v_cvt_f16_f32
2686; SI: v_cvt_f16_f32
2687; SI: v_cvt_f16_f32
2688; SI: v_cmp_eq_u32
2689; SI: v_cvt_f32_f16
2690; SI: v_cvt_f32_f16
2691; SI: v_cvt_f32_f16
2692; SI: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
2693; SI-NEXT: v_sub_f32_e32
2694; SI-NEXT: s_setpc_b64
2695
2696; VI: v_cmp_eq_u32_e32 vcc, 0, v0
2697; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
2698; VI-NEXT: v_sub_f16_e32 v0, v3, v0
2699; VI-NEXT: s_setpc_b64
; f16 variant: VI has native f16 sub so the fold mirrors the f32 case;
; SI has no f16 arithmetic and goes through f32 conversions (see the
; separate SI/VI CHECK prefixes above).
define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg half %x
  %neg.y = fneg half %y
  %select = select i1 %cmp, half %neg.x, half %neg.y
  %add = fadd half %select, %z
  ret half %add
}
2708
2709; FIXME: Terrible code for SI
2710; GCN-LABEL: {{^}}fadd_select_fneg_fneg_v2f16:
2711; SI: v_cvt_f16_f32
2712; SI: v_cvt_f16_f32
2713; SI: v_cvt_f16_f32
2714; SI: v_cvt_f16_f32
2715; SI: v_cmp_eq_u32
2716; SI: v_lshlrev_b32_e32
2717; SI: v_or_b32_e32
2718; SI: v_cndmask_b32
2719; SI: v_lshrrev_b32
2720; SI: v_cvt_f32_f16
2721; SI: v_cvt_f32_f16
2722; SI: v_cvt_f32_f16
2723; SI: v_cvt_f32_f16
2724; SI: v_sub_f32
2725; SI: v_sub_f32
2726
2727; VI: v_cmp_eq_u32_e32 vcc, 0, v0
2728; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
2729; VI-NEXT: v_sub_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2730; VI-NEXT: v_sub_f16_e32 v0, v3, v0
2731; VI-NEXT: v_or_b32_e32 v0, v0, v1
; <2 x half> variant; per the FIXME above, SI codegen for this is poor
; (scalarized through f32), while VI uses packed sub via SDWA.
define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg <2 x half> %x
  %neg.y = fneg <2 x half> %y
  %select = select i1 %cmp, <2 x half> %neg.x, <2 x half> %neg.y
  %add = fadd <2 x half> %select, %z
  ret <2 x half> %add
}
2740
2741; FIXME: This fneg should fold into select
2742; GCN-LABEL: {{^}}v_fneg_select_f32:
2743; GCN: s_waitcnt
2744; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2745; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
2746; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2747; GCN-NEXT: s_setpc_b64
; fneg of a select of two plain values: currently lowered as cndmask plus
; an explicit sign-bit xor; the FIXME above notes the fneg should instead
; fold into the select operands.
define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) {
  ; %c is intentionally unused; it only pads the argument list.
  %cond = icmp eq i32 %arg0, 0
  %select = select i1 %cond, float %a, float %b
  %fneg = fneg float %select
  ret float %fneg
}
2754
2755; FIXME: This fneg should fold into select
2756; GCN-LABEL: {{^}}v_fneg_select_2_f32:
2757; GCN: s_waitcnt
2758; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
2759; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
2760; GCN-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2761; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
2762; GCN-NSZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2763
2764; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
2765; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
2766; GCN-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2767; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
2768; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2769
2770; GCN-NEXT: s_setpc_b64
; fneg of a select whose operands are fadds: both SAFE and NSZ runs
; currently emit the adds, the cndmask, and a trailing sign-bit xor
; (see the FIXME above — the fneg could fold into the select arms).
define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) {
  ; %c is intentionally unused; it only pads the argument list.
  %cond = icmp eq i32 %arg0, 0
  %add.0 = fadd float %a, 2.0
  %add.1 = fadd float %b, 4.0
  %select = select i1 %cond, float %add.0, float %add.1
  %neg.select = fneg float %select
  ret float %neg.select
}
2779
2780; GCN-LABEL: {{^}}v_fneg_posk_select_f32:
2781; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
2782; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
2783; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; fneg of select(cond, +4.0, a): lowered as cndmask with the positive
; inline constant followed by a sign-bit xor (see CHECK lines).
define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
  ; Per-lane index; also reused as the select condition below.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %cond = icmp eq i32 %tid, 0
  ; +4.0 is an inline constant on AMDGPU, hence the cndmask immediate.
  %select = select i1 %cond, float 4.0, float %a
  %fneg = fneg float %select
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2796
2797; GCN-LABEL: {{^}}v_fneg_negk_select_f32:
2798; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
2799; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}, vcc
2800; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; Same as the posk test but with -4.0: the cndmask carries the negative
; inline constant and the fneg is still a separate sign-bit xor.
define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
  ; Per-lane index; also reused as the select condition below.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %cond = icmp eq i32 %tid, 0
  ; -4.0 is an inline constant on AMDGPU, hence the cndmask immediate.
  %select = select i1 %cond, float -4.0, float %a
  %fneg = fneg float %select
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
2813
2814declare i32 @llvm.amdgcn.workitem.id.x() #1
2815declare float @llvm.fma.f32(float, float, float) #1
2816declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
2817declare float @llvm.fmuladd.f32(float, float, float) #1
2818declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
2819declare float @llvm.sin.f32(float) #1
2820declare float @llvm.trunc.f32(float) #1
2821declare float @llvm.round.f32(float) #1
2822declare float @llvm.rint.f32(float) #1
2823declare float @llvm.nearbyint.f32(float) #1
2824declare float @llvm.canonicalize.f32(float) #1
2825declare float @llvm.minnum.f32(float, float) #1
2826declare float @llvm.maxnum.f32(float, float) #1
2827declare half @llvm.minnum.f16(half, half) #1
2828declare double @llvm.minnum.f64(double, double) #1
2829declare double @llvm.fma.f64(double, double, double) #1
2830
2831declare float @llvm.amdgcn.sin.f32(float) #1
2832declare float @llvm.amdgcn.rcp.f32(float) #1
2833declare float @llvm.amdgcn.rcp.legacy(float) #1
2834declare float @llvm.amdgcn.fmul.legacy(float, float) #1
2835declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
2836declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
2837
2838attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2839attributes #1 = { nounwind readnone }
2840attributes #2 = { nounwind "unsafe-fp-math"="true" }
2841attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
2842