; Make sure we still form mad instead of fma even when unsafe math or fp-contract is allowed.

; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s

; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs

; Make sure we don't form mad with denormals
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2
  %d = load volatile float, ptr addrspace(1) %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, ptr addrspace(1) %gep.out.0
  store volatile float %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2
  %d = load volatile float, ptr addrspace(1) %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, ptr addrspace(1) %gep.out.0
  store volatile float %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2
  %d = load volatile float, ptr addrspace(1) %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, ptr addrspace(1) %gep.out.0
  store volatile float %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fneg float %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2
  %d = load volatile float, ptr addrspace(1) %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fneg float %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, ptr addrspace(1) %gep.out.0
  store volatile float %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile float, ptr addrspace(1) %gep.0
  %b = load volatile float, ptr addrspace(1) %gep.1
  %c = load volatile float, ptr addrspace(1) %gep.2
  %d = load volatile float, ptr addrspace(1) %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fneg float %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, ptr addrspace(1) %gep.out.0
  store volatile float %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %x = load volatile float, ptr addrspace(1) %gep.0
  %y = load volatile float, ptr addrspace(1) %gep.1
  %z = load volatile float, ptr addrspace(1) %gep.2
  %u = load volatile float, ptr addrspace(1) %gep.3
  %v = load volatile float, ptr addrspace(1) %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, ptr addrspace(1) %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %x = load volatile float, ptr addrspace(1) %gep.0
  %y = load volatile float, ptr addrspace(1) %gep.1
  %z = load volatile float, ptr addrspace(1) %gep.2
  %u = load volatile float, ptr addrspace(1) %gep.3
  %v = load volatile float, ptr addrspace(1) %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, ptr addrspace(1) %gep.out
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %x = load volatile float, ptr addrspace(1) %gep.0
  %y = load volatile float, ptr addrspace(1) %gep.1
  %z = load volatile float, ptr addrspace(1) %gep.2
  %u = load volatile float, ptr addrspace(1) %gep.3
  %v = load volatile float, ptr addrspace(1) %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, ptr addrspace(1) %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %x = load volatile float, ptr addrspace(1) %gep.0
  %y = load volatile float, ptr addrspace(1) %gep.1
  %z = load volatile float, ptr addrspace(1) %gep.2
  %u = load volatile float, ptr addrspace(1) %gep.3
  %v = load volatile float, ptr addrspace(1) %gep.4

  ; nsz flag is needed since this combine may change the sign of zero
  %tmp0 = fmul nsz float %u, %v
  %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub nsz float %x, %tmp1

  store float %tmp2, ptr addrspace(1) %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }