xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fma-combine.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
7
8; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
9
10; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
11; beneficial even without fp32 denormals, but they do require no-infs-fp-math
12; for correctness.
13
14declare i32 @llvm.amdgcn.workitem.id.x() #0
15declare double @llvm.fabs.f64(double) #0
16declare double @llvm.fma.f64(double, double, double) #0
17declare float @llvm.fma.f32(float, float, float) #0
18declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
19
20; (fadd (fmul x, y), z) -> (fma x, y, z)
; Kernel under test: reads a, b, c from in[tid], in[tid+1], in[tid+2] and
; stores (a * b) + c to out[tid]. The expected output below contains a single
; v_fma_f64 and no separate multiply or add. Only the shared SI and GFX11
; prefixes are used, so the same output is expected under every RUN line.
21define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
22; SI-LABEL: combine_to_fma_f64_0:
23; SI:       ; %bb.0:
24; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
25; SI-NEXT:    s_mov_b32 s7, 0xf000
26; SI-NEXT:    s_mov_b32 s6, 0
27; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
28; SI-NEXT:    v_mov_b32_e32 v1, 0
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
31; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
32; SI-NEXT:    s_waitcnt vmcnt(0)
33; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
34; SI-NEXT:    s_waitcnt vmcnt(0)
35; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
36; SI-NEXT:    s_waitcnt vmcnt(0)
37; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
38; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
39; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
40; SI-NEXT:    s_endpgm
41;
42; GFX11-LABEL: combine_to_fma_f64_0:
43; GFX11:       ; %bb.0:
44; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
45; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
46; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
47; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
48; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
50; GFX11-NEXT:    s_waitcnt vmcnt(0)
51; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
52; GFX11-NEXT:    s_waitcnt vmcnt(0)
53; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
54; GFX11-NEXT:    s_waitcnt vmcnt(0)
55; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
56; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
57; GFX11-NEXT:    s_endpgm
  ; Per-workitem addressing: three consecutive doubles starting at in[tid],
  ; one output slot at out[tid].
58  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
59  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
60  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
61  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
62  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
63
  ; Volatile loads; the expected output keeps them as three separate loads,
  ; each followed by its own s_waitcnt.
64  %a = load volatile double, ptr addrspace(1) %gep.0
65  %b = load volatile double, ptr addrspace(1) %gep.1
66  %c = load volatile double, ptr addrspace(1) %gep.2
67
  ; Unfused multiply feeding the first operand of the add; this pair is what
  ; the checks expect to become one fma.
68  %mul = fmul double %a, %b
69  %fma = fadd double %mul, %c
70  store double %fma, ptr addrspace(1) %gep.out
71  ret void
72}
73
74; (fadd (fmul x, y), z) -> (fma x, y, z)
; Two-use variant: the same product a * b is added to c and, separately, to d.
; The expected output contains two v_fma_f64 instructions that share the a and
; b operands, and no standalone multiply, i.e. the multiply is folded into
; both users. Results go to out[tid] and out[tid+1].
75define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
76; SI-LABEL: combine_to_fma_f64_0_2use:
77; SI:       ; %bb.0:
78; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
79; SI-NEXT:    s_mov_b32 s7, 0xf000
80; SI-NEXT:    s_mov_b32 s6, 0
81; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
82; SI-NEXT:    v_mov_b32_e32 v1, 0
83; SI-NEXT:    s_waitcnt lgkmcnt(0)
84; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
85; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
86; SI-NEXT:    s_waitcnt vmcnt(0)
87; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
88; SI-NEXT:    s_waitcnt vmcnt(0)
89; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
90; SI-NEXT:    s_waitcnt vmcnt(0)
91; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
92; SI-NEXT:    s_waitcnt vmcnt(0)
93; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
94; SI-NEXT:    v_fma_f64 v[6:7], v[2:3], v[4:5], v[6:7]
95; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
96; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
97; SI-NEXT:    s_waitcnt vmcnt(0)
98; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
99; SI-NEXT:    s_waitcnt vmcnt(0)
100; SI-NEXT:    s_endpgm
101;
102; GFX11-LABEL: combine_to_fma_f64_0_2use:
103; GFX11:       ; %bb.0:
104; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
105; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
106; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
107; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
108; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
110; GFX11-NEXT:    s_waitcnt vmcnt(0)
111; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
112; GFX11-NEXT:    s_waitcnt vmcnt(0)
113; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
114; GFX11-NEXT:    s_waitcnt vmcnt(0)
115; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
116; GFX11-NEXT:    s_waitcnt vmcnt(0)
117; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5]
118; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
119; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
120; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
121; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
122; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX11-NEXT:    s_endpgm
  ; Four consecutive input doubles starting at in[tid]; two consecutive output
  ; slots starting at out[tid].
124  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
125  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
126  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
127  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
128  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
129  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
130  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1
131
132  %a = load volatile double, ptr addrspace(1) %gep.0
133  %b = load volatile double, ptr addrspace(1) %gep.1
134  %c = load volatile double, ptr addrspace(1) %gep.2
135  %d = load volatile double, ptr addrspace(1) %gep.3
136
  ; %mul has two users (%fma0 and %fma1); this is the "2use" in the test name.
137  %mul = fmul double %a, %b
138  %fma0 = fadd double %mul, %c
139  %fma1 = fadd double %mul, %d
  ; Volatile stores; the expected output shows a wait after each store.
140  store volatile double %fma0, ptr addrspace(1) %gep.out.0
141  store volatile double %fma1, ptr addrspace(1) %gep.out.1
142  ret void
143}
144
145; (fadd x, (fmul y, z)) -> (fma y, z, x)
; Commuted form of the basic fadd case: the product is the second operand of
; the add (c + (a * b)). The expected output is the same single v_fma_f64 with
; the product operands first and c as the addend.
146define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
147; SI-LABEL: combine_to_fma_f64_1:
148; SI:       ; %bb.0:
149; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
150; SI-NEXT:    s_mov_b32 s7, 0xf000
151; SI-NEXT:    s_mov_b32 s6, 0
152; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
153; SI-NEXT:    v_mov_b32_e32 v1, 0
154; SI-NEXT:    s_waitcnt lgkmcnt(0)
155; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
156; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
157; SI-NEXT:    s_waitcnt vmcnt(0)
158; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
159; SI-NEXT:    s_waitcnt vmcnt(0)
160; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
161; SI-NEXT:    s_waitcnt vmcnt(0)
162; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
163; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
164; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
165; SI-NEXT:    s_endpgm
166;
167; GFX11-LABEL: combine_to_fma_f64_1:
168; GFX11:       ; %bb.0:
169; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
170; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
171; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
172; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
173; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
175; GFX11-NEXT:    s_waitcnt vmcnt(0)
176; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
177; GFX11-NEXT:    s_waitcnt vmcnt(0)
178; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
179; GFX11-NEXT:    s_waitcnt vmcnt(0)
180; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
181; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
182; GFX11-NEXT:    s_endpgm
  ; Inputs are in[tid], in[tid+1], in[tid+2]; the result goes to out[tid].
183  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
184  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
185  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
186  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
187  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
188
189  %a = load volatile double, ptr addrspace(1) %gep.0
190  %b = load volatile double, ptr addrspace(1) %gep.1
191  %c = load volatile double, ptr addrspace(1) %gep.2
192
  ; Note the operand order of the fadd: %c first, the product second.
193  %mul = fmul double %a, %b
194  %fma = fadd double %c, %mul
195  store double %fma, ptr addrspace(1) %gep.out
196  ret void
197}
198
199; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; Computes (a * b) - c for a, b, c loaded from in[tid], in[tid+1], in[tid+2]
; and stores it to out[tid]. The expected output is a single v_fma_f64 whose
; third operand is negated (-v[...]); no separate subtract or negate appears.
200define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
201; SI-LABEL: combine_to_fma_fsub_0_f64:
202; SI:       ; %bb.0:
203; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
204; SI-NEXT:    s_mov_b32 s7, 0xf000
205; SI-NEXT:    s_mov_b32 s6, 0
206; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
207; SI-NEXT:    v_mov_b32_e32 v1, 0
208; SI-NEXT:    s_waitcnt lgkmcnt(0)
209; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
210; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
211; SI-NEXT:    s_waitcnt vmcnt(0)
212; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
213; SI-NEXT:    s_waitcnt vmcnt(0)
214; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
215; SI-NEXT:    s_waitcnt vmcnt(0)
216; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
217; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[6:7]
218; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
219; SI-NEXT:    s_endpgm
220;
221; GFX11-LABEL: combine_to_fma_fsub_0_f64:
222; GFX11:       ; %bb.0:
223; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
224; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
225; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
226; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
227; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
229; GFX11-NEXT:    s_waitcnt vmcnt(0)
230; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
231; GFX11-NEXT:    s_waitcnt vmcnt(0)
232; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
233; GFX11-NEXT:    s_waitcnt vmcnt(0)
234; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
235; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
236; GFX11-NEXT:    s_endpgm
  ; Inputs are in[tid], in[tid+1], in[tid+2]; the result goes to out[tid].
237  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
238  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
239  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
240  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
241  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
242
243  %a = load volatile double, ptr addrspace(1) %gep.0
244  %b = load volatile double, ptr addrspace(1) %gep.1
245  %c = load volatile double, ptr addrspace(1) %gep.2
246
  ; The product is the minuend: (a * b) - c.
247  %mul = fmul double %a, %b
248  %fma = fsub double %mul, %c
249  store double %fma, ptr addrspace(1) %gep.out
250  ret void
251}
252
253; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; Two-use variant: the same product a * b is the minuend of two subtractions,
; (a * b) - c and (a * b) - d. The expected output contains two v_fma_f64
; instructions, each with a negated third operand, and no standalone multiply.
254define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
255; SI-LABEL: combine_to_fma_fsub_f64_0_2use:
256; SI:       ; %bb.0:
257; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
258; SI-NEXT:    s_mov_b32 s7, 0xf000
259; SI-NEXT:    s_mov_b32 s6, 0
260; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
261; SI-NEXT:    v_mov_b32_e32 v1, 0
262; SI-NEXT:    s_waitcnt lgkmcnt(0)
263; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
264; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
265; SI-NEXT:    s_waitcnt vmcnt(0)
266; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
267; SI-NEXT:    s_waitcnt vmcnt(0)
268; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
269; SI-NEXT:    s_waitcnt vmcnt(0)
270; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
271; SI-NEXT:    s_waitcnt vmcnt(0)
272; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
273; SI-NEXT:    v_fma_f64 v[6:7], v[2:3], v[4:5], -v[6:7]
274; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
275; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
276; SI-NEXT:    s_waitcnt vmcnt(0)
277; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
278; SI-NEXT:    s_waitcnt vmcnt(0)
279; SI-NEXT:    s_endpgm
280;
281; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use:
282; GFX11:       ; %bb.0:
283; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
284; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
285; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
286; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
287; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
289; GFX11-NEXT:    s_waitcnt vmcnt(0)
290; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
291; GFX11-NEXT:    s_waitcnt vmcnt(0)
292; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
293; GFX11-NEXT:    s_waitcnt vmcnt(0)
294; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
295; GFX11-NEXT:    s_waitcnt vmcnt(0)
296; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5]
297; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
298; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
299; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
300; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
301; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
302; GFX11-NEXT:    s_endpgm
  ; Four consecutive input doubles starting at in[tid]; two consecutive output
  ; slots starting at out[tid].
303  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
304  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
305  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
306  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
307  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
308  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
309  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1
310
311  %a = load volatile double, ptr addrspace(1) %gep.0
312  %b = load volatile double, ptr addrspace(1) %gep.1
313  %c = load volatile double, ptr addrspace(1) %gep.2
314  %d = load volatile double, ptr addrspace(1) %gep.3
315
  ; %mul is the minuend of both subtractions, so it has two users.
316  %mul = fmul double %a, %b
317  %fma0 = fsub double %mul, %c
318  %fma1 = fsub double %mul, %d
  ; Volatile stores; the expected output shows a wait after each store.
319  store volatile double %fma0, ptr addrspace(1) %gep.out.0
320  store volatile double %fma1, ptr addrspace(1) %gep.out.1
321  ret void
322}
323
324; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; Computes c - (a * b) for a, b, c loaded from in[tid], in[tid+1], in[tid+2]
; and stores it to out[tid]. The expected output is a single v_fma_f64 whose
; first operand is negated (-v[...]) and whose addend is c.
325define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
326; SI-LABEL: combine_to_fma_fsub_1_f64:
327; SI:       ; %bb.0:
328; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
329; SI-NEXT:    s_mov_b32 s7, 0xf000
330; SI-NEXT:    s_mov_b32 s6, 0
331; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
332; SI-NEXT:    v_mov_b32_e32 v1, 0
333; SI-NEXT:    s_waitcnt lgkmcnt(0)
334; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
335; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
336; SI-NEXT:    s_waitcnt vmcnt(0)
337; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
338; SI-NEXT:    s_waitcnt vmcnt(0)
339; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
340; SI-NEXT:    s_waitcnt vmcnt(0)
341; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
342; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], v[6:7]
343; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
344; SI-NEXT:    s_endpgm
345;
346; GFX11-LABEL: combine_to_fma_fsub_1_f64:
347; GFX11:       ; %bb.0:
348; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
349; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
350; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
351; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
352; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
354; GFX11-NEXT:    s_waitcnt vmcnt(0)
355; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
356; GFX11-NEXT:    s_waitcnt vmcnt(0)
357; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
358; GFX11-NEXT:    s_waitcnt vmcnt(0)
359; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
360; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
361; GFX11-NEXT:    s_endpgm
  ; Inputs are in[tid], in[tid+1], in[tid+2]; the result goes to out[tid].
362  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
363  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
364  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
365  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
366  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
367
368  %a = load volatile double, ptr addrspace(1) %gep.0
369  %b = load volatile double, ptr addrspace(1) %gep.1
370  %c = load volatile double, ptr addrspace(1) %gep.2
371
  ; The product is the subtrahend here: c - (a * b).
372  %mul = fmul double %a, %b
373  %fma = fsub double %c, %mul
374  store double %fma, ptr addrspace(1) %gep.out
375  ret void
376}
377
378; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; Two-use variant: the same product a * b is subtracted from c and from d.
; The expected output contains two v_fma_f64 instructions, each with a negated
; first operand, and no standalone multiply.
379define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
380; SI-LABEL: combine_to_fma_fsub_1_f64_2use:
381; SI:       ; %bb.0:
382; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
383; SI-NEXT:    s_mov_b32 s7, 0xf000
384; SI-NEXT:    s_mov_b32 s6, 0
385; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
386; SI-NEXT:    v_mov_b32_e32 v1, 0
387; SI-NEXT:    s_waitcnt lgkmcnt(0)
388; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
389; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
390; SI-NEXT:    s_waitcnt vmcnt(0)
391; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
392; SI-NEXT:    s_waitcnt vmcnt(0)
393; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
394; SI-NEXT:    s_waitcnt vmcnt(0)
395; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
396; SI-NEXT:    s_waitcnt vmcnt(0)
397; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
398; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], v[6:7]
399; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], v[8:9]
400; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
401; SI-NEXT:    s_waitcnt vmcnt(0)
402; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
403; SI-NEXT:    s_waitcnt vmcnt(0)
404; SI-NEXT:    s_endpgm
405;
406; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use:
407; GFX11:       ; %bb.0:
408; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
409; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
410; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
411; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
412; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
414; GFX11-NEXT:    s_waitcnt vmcnt(0)
415; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
416; GFX11-NEXT:    s_waitcnt vmcnt(0)
417; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
418; GFX11-NEXT:    s_waitcnt vmcnt(0)
419; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
420; GFX11-NEXT:    s_waitcnt vmcnt(0)
421; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5]
422; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7]
423; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
424; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
425; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
426; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
427; GFX11-NEXT:    s_endpgm
  ; Four consecutive input doubles starting at in[tid]; two consecutive output
  ; slots starting at out[tid].
428  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
429  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
430  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
431  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
432  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
433  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
434  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1
435
436  %a = load volatile double, ptr addrspace(1) %gep.0
437  %b = load volatile double, ptr addrspace(1) %gep.1
438  %c = load volatile double, ptr addrspace(1) %gep.2
439  %d = load volatile double, ptr addrspace(1) %gep.3
440
  ; %mul is the subtrahend of both subtractions, so it has two users.
441  %mul = fmul double %a, %b
442  %fma0 = fsub double %c, %mul
443  %fma1 = fsub double %d, %mul
  ; Volatile stores; the expected output shows a wait after each store.
444  store volatile double %fma0, ptr addrspace(1) %gep.out.0
445  store volatile double %fma1, ptr addrspace(1) %gep.out.1
446  ret void
447}
448
449; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Computes (-(a * b)) - c for a, b, c loaded from in[tid], in[tid+1],
; in[tid+2] and stores it to out[tid]. The expected output is a single
; v_fma_f64 with both the first and the third operand negated; no separate
; multiply, negate or subtract appears.
450define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
451; SI-LABEL: combine_to_fma_fsub_2_f64:
452; SI:       ; %bb.0:
453; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
454; SI-NEXT:    s_mov_b32 s7, 0xf000
455; SI-NEXT:    s_mov_b32 s6, 0
456; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
457; SI-NEXT:    v_mov_b32_e32 v1, 0
458; SI-NEXT:    s_waitcnt lgkmcnt(0)
459; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
460; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
461; SI-NEXT:    s_waitcnt vmcnt(0)
462; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
463; SI-NEXT:    s_waitcnt vmcnt(0)
464; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
465; SI-NEXT:    s_waitcnt vmcnt(0)
466; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
467; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[6:7]
468; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
469; SI-NEXT:    s_endpgm
470;
471; GFX11-LABEL: combine_to_fma_fsub_2_f64:
472; GFX11:       ; %bb.0:
473; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
474; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
475; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
476; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
477; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
479; GFX11-NEXT:    s_waitcnt vmcnt(0)
480; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
481; GFX11-NEXT:    s_waitcnt vmcnt(0)
482; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
483; GFX11-NEXT:    s_waitcnt vmcnt(0)
484; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
485; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
486; GFX11-NEXT:    s_endpgm
  ; Inputs are in[tid], in[tid+1], in[tid+2]; the result goes to out[tid].
487  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
488  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
489  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
490  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
491  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
492
493  %a = load volatile double, ptr addrspace(1) %gep.0
494  %b = load volatile double, ptr addrspace(1) %gep.1
495  %c = load volatile double, ptr addrspace(1) %gep.2
496
  ; The negation of the product is spelled as a subtraction from -0.0 rather
  ; than with a dedicated negate instruction.
497  %mul = fmul double %a, %b
498  %mul.neg = fsub double -0.0, %mul
499  %fma = fsub double %mul.neg, %c
500
501  store double %fma, ptr addrspace(1) %gep.out
502  ret void
503}
504
505; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Variant in which the negated product has two users: (-(a * b)) - c and
; (-(a * b)) - d. The expected output contains two v_fma_f64 instructions,
; each with the first and third operand negated, and no standalone multiply
; or negate.
506define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
507; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
508; SI:       ; %bb.0:
509; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
510; SI-NEXT:    s_mov_b32 s7, 0xf000
511; SI-NEXT:    s_mov_b32 s6, 0
512; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
513; SI-NEXT:    v_mov_b32_e32 v1, 0
514; SI-NEXT:    s_waitcnt lgkmcnt(0)
515; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
516; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
517; SI-NEXT:    s_waitcnt vmcnt(0)
518; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
519; SI-NEXT:    s_waitcnt vmcnt(0)
520; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
521; SI-NEXT:    s_waitcnt vmcnt(0)
522; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
523; SI-NEXT:    s_waitcnt vmcnt(0)
524; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
525; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
526; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[8:9]
527; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
528; SI-NEXT:    s_waitcnt vmcnt(0)
529; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
530; SI-NEXT:    s_waitcnt vmcnt(0)
531; SI-NEXT:    s_endpgm
532;
533; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
534; GFX11:       ; %bb.0:
535; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
536; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
537; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
538; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
539; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
541; GFX11-NEXT:    s_waitcnt vmcnt(0)
542; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
543; GFX11-NEXT:    s_waitcnt vmcnt(0)
544; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
545; GFX11-NEXT:    s_waitcnt vmcnt(0)
546; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
547; GFX11-NEXT:    s_waitcnt vmcnt(0)
548; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
549; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7]
550; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
551; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
552; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
553; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
554; GFX11-NEXT:    s_endpgm
  ; Four consecutive input doubles starting at in[tid]; two consecutive output
  ; slots starting at out[tid].
555  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
556  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
557  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
558  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
559  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
560  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
561  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1
562
563  %a = load volatile double, ptr addrspace(1) %gep.0
564  %b = load volatile double, ptr addrspace(1) %gep.1
565  %c = load volatile double, ptr addrspace(1) %gep.2
566  %d = load volatile double, ptr addrspace(1) %gep.3
567
  ; %mul has a single user (%mul.neg); it is %mul.neg that is used twice.
568  %mul = fmul double %a, %b
569  %mul.neg = fsub double -0.0, %mul
570  %fma0 = fsub double %mul.neg, %c
571  %fma1 = fsub double %mul.neg, %d
572
  ; Volatile stores; the expected output shows a wait after each store.
573  store volatile double %fma0, ptr addrspace(1) %gep.out.0
574  store volatile double %fma1, ptr addrspace(1) %gep.out.1
575  ret void
576}
577
578; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Variant in which the product itself has two users: the negation feeding
; (-(a * b)) - c, and a direct subtraction (a * b) - d. The expected output
; contains one v_fma_f64 with the first and third operand negated and one
; with only the third operand negated; no standalone multiply or negate.
579define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
580; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
581; SI:       ; %bb.0:
582; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
583; SI-NEXT:    s_mov_b32 s7, 0xf000
584; SI-NEXT:    s_mov_b32 s6, 0
585; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
586; SI-NEXT:    v_mov_b32_e32 v1, 0
587; SI-NEXT:    s_waitcnt lgkmcnt(0)
588; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
589; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
590; SI-NEXT:    s_waitcnt vmcnt(0)
591; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
592; SI-NEXT:    s_waitcnt vmcnt(0)
593; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
594; SI-NEXT:    s_waitcnt vmcnt(0)
595; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
596; SI-NEXT:    s_waitcnt vmcnt(0)
597; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
598; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
599; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
600; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
601; SI-NEXT:    s_waitcnt vmcnt(0)
602; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
603; SI-NEXT:    s_waitcnt vmcnt(0)
604; SI-NEXT:    s_endpgm
605;
606; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
607; GFX11:       ; %bb.0:
608; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
609; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
610; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
611; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
612; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
614; GFX11-NEXT:    s_waitcnt vmcnt(0)
615; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
616; GFX11-NEXT:    s_waitcnt vmcnt(0)
617; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
618; GFX11-NEXT:    s_waitcnt vmcnt(0)
619; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
620; GFX11-NEXT:    s_waitcnt vmcnt(0)
621; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
622; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
623; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
624; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
625; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
626; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
627; GFX11-NEXT:    s_endpgm
  ; Four consecutive input doubles starting at in[tid]; two consecutive output
  ; slots starting at out[tid].
628  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
629  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
630  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
631  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
632  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
633  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
634  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1
635
636  %a = load volatile double, ptr addrspace(1) %gep.0
637  %b = load volatile double, ptr addrspace(1) %gep.1
638  %c = load volatile double, ptr addrspace(1) %gep.2
639  %d = load volatile double, ptr addrspace(1) %gep.3
640
  ; %mul is used twice: once through %mul.neg (for %fma0) and once directly
  ; as the minuend of %fma1.
641  %mul = fmul double %a, %b
642  %mul.neg = fsub double -0.0, %mul
643  %fma0 = fsub double %mul.neg, %c
644  %fma1 = fsub double %mul, %d
645
  ; Volatile stores; the expected output shows a wait after each store.
646  store volatile double %fma0, ptr addrspace(1) %gep.out.0
647  store volatile double %fma1, ptr addrspace(1) %gep.out.1
648  ret void
649}
650
651; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
; Kernel under test: reads five consecutive doubles x, y, z, u, v starting at
; %in[tid] with volatile loads, computes fma(x, y, u * v) - z, and stores the
; result to %out[tid].
; Expected codegen, as recorded in the check lines below:
;   NOFMA runs - v_mul_f64 for u * v, one v_fma_f64, then v_add_f64 with -z.
;   FMA runs (the RUN lines passing -enable-unsafe-fp-math) - two v_fma_f64:
;     fma(u, v, -z) feeding fma(x, y, ...), with no separate multiply or add.
652define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
653; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
654; SI-NOFMA:       ; %bb.0:
655; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
656; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
657; SI-NOFMA-NEXT:    s_mov_b32 s6, 0
658; SI-NOFMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
659; SI-NOFMA-NEXT:    v_mov_b32_e32 v1, 0
660; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
661; SI-NOFMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
662; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
663; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
664; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
665; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
666; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
667; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
668; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
669; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
670; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
671; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
672; SI-NOFMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
673; SI-NOFMA-NEXT:    v_mul_f64 v[8:9], v[8:9], v[10:11]
674; SI-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
675; SI-NOFMA-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
676; SI-NOFMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
677; SI-NOFMA-NEXT:    s_endpgm
678;
679; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
680; SI-FMA:       ; %bb.0:
681; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
682; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
683; SI-FMA-NEXT:    s_mov_b32 s6, 0
684; SI-FMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
685; SI-FMA-NEXT:    v_mov_b32_e32 v1, 0
686; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
687; SI-FMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
688; SI-FMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
689; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
690; SI-FMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
691; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
692; SI-FMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
693; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
694; SI-FMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
695; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
696; SI-FMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
697; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
698; SI-FMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
699; SI-FMA-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7]
700; SI-FMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
701; SI-FMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
702; SI-FMA-NEXT:    s_endpgm
703;
704; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
705; GFX11-NOFMA:       ; %bb.0:
706; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
707; GFX11-NOFMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
708; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
709; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
710; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
711; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
712; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
713; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
714; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
715; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
716; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
717; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
718; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
719; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
720; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
721; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
722; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
723; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
724; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
725; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
726; GFX11-NOFMA-NEXT:    s_endpgm
727;
728; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
729; GFX11-FMA:       ; %bb.0:
730; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
731; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
732; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
733; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
734; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
735; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
736; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
737; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
738; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
739; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
740; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
741; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
742; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
743; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
744; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
745; GFX11-FMA-NEXT:    v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
746; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
747; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
748; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
749; GFX11-FMA-NEXT:    s_endpgm
750  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
751  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
752  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
753  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
754  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
755  %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
756  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
757
  ; Volatile loads: the check lines above show each load followed by its own
  ; s_waitcnt, in the same order as written here.
758  %x = load volatile double, ptr addrspace(1) %gep.0
759  %y = load volatile double, ptr addrspace(1) %gep.1
760  %z = load volatile double, ptr addrspace(1) %gep.2
761  %u = load volatile double, ptr addrspace(1) %gep.3
762  %v = load volatile double, ptr addrspace(1) %gep.4
763
  ; Input pattern: fsub(fma(x, y, fmul(u, v)), z). No fast-math flags are set
  ; on these instructions.
764  %tmp0 = fmul double %u, %v
765  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
766  %tmp2 = fsub double %tmp1, %z
767
768  store double %tmp2, ptr addrspace(1) %gep.out
769  ret void
770}
771
772; fold (fsub x, (fma y, z, (fmul u, v)))
773;   -> (fma (fneg y), z, (fma (fneg u), v, x))
; Kernel under test: reads five consecutive doubles x, y, z, u, v starting at
; %in[tid] with volatile loads, computes x - fma(y, z, u * v) with nsz on all
; three arithmetic operations, and stores the result to %out[tid].
; Expected codegen, as recorded in the check lines below:
;   NOFMA runs - v_mul_f64 for u * v, one v_fma_f64, then v_add_f64 of x with
;     the negated fma result.
;   FMA runs (the RUN lines passing -enable-unsafe-fp-math) - two v_fma_f64:
;     fma(-u, v, x) feeding fma(-y, z, ...).
774define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
775; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
776; SI-NOFMA:       ; %bb.0:
777; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
778; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
779; SI-NOFMA-NEXT:    s_mov_b32 s6, 0
780; SI-NOFMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
781; SI-NOFMA-NEXT:    v_mov_b32_e32 v1, 0
782; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
783; SI-NOFMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
784; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
785; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
786; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
787; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
788; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
789; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
790; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
791; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
792; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
793; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
794; SI-NOFMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
795; SI-NOFMA-NEXT:    v_mul_f64 v[8:9], v[8:9], v[10:11]
796; SI-NOFMA-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
797; SI-NOFMA-NEXT:    v_add_f64 v[2:3], v[2:3], -v[4:5]
798; SI-NOFMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
799; SI-NOFMA-NEXT:    s_endpgm
800;
801; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
802; SI-FMA:       ; %bb.0:
803; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
804; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
805; SI-FMA-NEXT:    s_mov_b32 s6, 0
806; SI-FMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
807; SI-FMA-NEXT:    v_mov_b32_e32 v1, 0
808; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
809; SI-FMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
810; SI-FMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
811; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
812; SI-FMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
813; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
814; SI-FMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
815; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
816; SI-FMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
817; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
818; SI-FMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
819; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
820; SI-FMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
821; SI-FMA-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3]
822; SI-FMA-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3]
823; SI-FMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
824; SI-FMA-NEXT:    s_endpgm
825;
826; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
827; GFX11-NOFMA:       ; %bb.0:
828; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
829; GFX11-NOFMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
830; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
831; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
832; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
834; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
835; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
836; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
837; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
838; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
839; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
840; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
841; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
842; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
843; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
844; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
845; GFX11-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
846; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
847; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
848; GFX11-NOFMA-NEXT:    s_endpgm
849;
850; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
851; GFX11-FMA:       ; %bb.0:
852; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
853; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
854; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
855; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
856; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
858; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
859; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
860; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
861; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
862; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
863; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
864; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
865; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
866; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
867; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1]
868; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
869; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
870; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
871; GFX11-FMA-NEXT:    s_endpgm
872  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
873  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
874  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
875  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
876  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
877  %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
878  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
879
  ; Volatile loads: the check lines above show each load followed by its own
  ; s_waitcnt, in the same order as written here.
880  %x = load volatile double, ptr addrspace(1) %gep.0
881  %y = load volatile double, ptr addrspace(1) %gep.1
882  %z = load volatile double, ptr addrspace(1) %gep.2
883  %u = load volatile double, ptr addrspace(1) %gep.3
884  %v = load volatile double, ptr addrspace(1) %gep.4
885
886  ; nsz flag is needed since this combine may change sign of zero
887  %tmp0 = fmul nsz double %u, %v
888  %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
889  %tmp2 = fsub nsz double %x, %tmp1
890
891  store double %tmp2, ptr addrspace(1) %gep.out
892  ret void
893}
894
895;
896; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
897;
898
; Kernel under test: loads float x from %in1 and float y from %in2 (both
; volatile), computes (x + 1.0) * y, and stores the product to %out.
; Expected codegen, as recorded in the check lines below:
;   NOFMA runs - v_add_f32 with 1.0 followed by v_mul_f32.
;   FMA runs (the RUN lines passing -enable-no-infs-fp-math and
;     -enable-unsafe-fp-math) - a single fused op computing x * y + y
;     (v_fma_f32 on tahiti, v_fmac_f32 on gfx1100).
899define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
900; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y:
901; SI-NOFMA:       ; %bb.0:
902; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
903; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
904; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
905; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
906; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
907; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
908; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
909; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
910; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
911; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
912; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
913; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
914; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
915; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0 glc
916; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
917; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
918; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
919; SI-NOFMA-NEXT:    v_add_f32_e32 v0, 1.0, v0
920; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
921; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
922; SI-NOFMA-NEXT:    s_endpgm
923;
924; SI-FMA-LABEL: test_f32_mul_add_x_one_y:
925; SI-FMA:       ; %bb.0:
926; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
927; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
928; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
929; SI-FMA-NEXT:    s_mov_b32 s6, -1
930; SI-FMA-NEXT:    s_mov_b32 s14, s6
931; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
932; SI-FMA-NEXT:    s_mov_b32 s12, s2
933; SI-FMA-NEXT:    s_mov_b32 s13, s3
934; SI-FMA-NEXT:    s_mov_b32 s15, s7
935; SI-FMA-NEXT:    s_mov_b32 s10, s6
936; SI-FMA-NEXT:    s_mov_b32 s11, s7
937; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
938; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
939; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0 glc
940; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
941; SI-FMA-NEXT:    s_mov_b32 s4, s0
942; SI-FMA-NEXT:    s_mov_b32 s5, s1
943; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
944; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
945; SI-FMA-NEXT:    s_endpgm
946;
947; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y:
948; GFX11-NOFMA:       ; %bb.0:
949; GFX11-NOFMA-NEXT:    s_clause 0x1
950; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
951; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
952; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
953; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
955; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
956; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
957; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
958; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, 1.0, v1
959; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
960; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
961; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
962; GFX11-NOFMA-NEXT:    s_endpgm
963;
964; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y:
965; GFX11-FMA:       ; %bb.0:
966; GFX11-FMA-NEXT:    s_clause 0x1
967; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
968; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
969; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
970; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
971; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
972; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
973; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
974; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
975; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
976; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1]
977; GFX11-FMA-NEXT:    s_endpgm
978                                        ptr addrspace(1) %in1,
979                                        ptr addrspace(1) %in2) {
980  %x = load volatile float, ptr addrspace(1) %in1
981  %y = load volatile float, ptr addrspace(1) %in2
  ; Input pattern: fmul(fadd(x, 1.0), y), with the sum as the first multiplicand.
982  %a = fadd float %x, 1.0
983  %m = fmul float %a, %y
984  store float %m, ptr addrspace(1) %out
985  ret void
986}
987
; Kernel under test: loads float x from %in1 and float y from %in2 (both
; volatile), computes y * (x + 1.0), and stores the product to %out. This is
; the operand-commuted form of (x + 1.0) * y.
; Expected codegen, as recorded in the check lines below:
;   NOFMA runs - v_add_f32 with 1.0 followed by v_mul_f32 (y as first source).
;   FMA runs (the RUN lines passing -enable-no-infs-fp-math and
;     -enable-unsafe-fp-math) - a single fused op computing x * y + y
;     (v_fma_f32 on tahiti, v_fmac_f32 on gfx1100).
988define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
989; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one:
990; SI-NOFMA:       ; %bb.0:
991; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
992; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
993; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
994; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
995; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
996; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
997; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
998; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
999; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1000; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1001; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1002; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
1003; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1004; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0 glc
1005; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1006; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1007; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1008; SI-NOFMA-NEXT:    v_add_f32_e32 v0, 1.0, v0
1009; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1010; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1011; SI-NOFMA-NEXT:    s_endpgm
1012;
1013; SI-FMA-LABEL: test_f32_mul_y_add_x_one:
1014; SI-FMA:       ; %bb.0:
1015; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1016; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1017; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1018; SI-FMA-NEXT:    s_mov_b32 s6, -1
1019; SI-FMA-NEXT:    s_mov_b32 s14, s6
1020; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1021; SI-FMA-NEXT:    s_mov_b32 s12, s2
1022; SI-FMA-NEXT:    s_mov_b32 s13, s3
1023; SI-FMA-NEXT:    s_mov_b32 s15, s7
1024; SI-FMA-NEXT:    s_mov_b32 s10, s6
1025; SI-FMA-NEXT:    s_mov_b32 s11, s7
1026; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
1027; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1028; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0 glc
1029; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1030; SI-FMA-NEXT:    s_mov_b32 s4, s0
1031; SI-FMA-NEXT:    s_mov_b32 s5, s1
1032; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
1033; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1034; SI-FMA-NEXT:    s_endpgm
1035;
1036; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one:
1037; GFX11-NOFMA:       ; %bb.0:
1038; GFX11-NOFMA-NEXT:    s_clause 0x1
1039; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1040; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1041; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1042; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1043; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
1044; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1045; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
1046; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1047; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, 1.0, v1
1048; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1049; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
1050; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1051; GFX11-NOFMA-NEXT:    s_endpgm
1052;
1053; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one:
1054; GFX11-FMA:       ; %bb.0:
1055; GFX11-FMA-NEXT:    s_clause 0x1
1056; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1057; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1058; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1059; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1060; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
1061; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1062; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
1063; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1064; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
1065; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1]
1066; GFX11-FMA-NEXT:    s_endpgm
1067                                        ptr addrspace(1) %in1,
1068                                        ptr addrspace(1) %in2) {
1069  %x = load volatile float, ptr addrspace(1) %in1
1070  %y = load volatile float, ptr addrspace(1) %in2
  ; Input pattern: fmul(y, fadd(x, 1.0)), with the sum as the second multiplicand.
1071  %a = fadd float %x, 1.0
1072  %m = fmul float %y, %a
1073  store float %m, ptr addrspace(1) %out
1074  ret void
1075}
1076
; Kernel under test: loads float x from %in1 and float y from %in2 (plain,
; non-volatile loads), computes (x + -1.0) * y, and stores the product to %out.
; Expected codegen, as recorded in the check lines below:
;   NOFMA runs - v_add_f32 with -1.0 followed by v_mul_f32.
;   FMA runs (the RUN lines passing -enable-no-infs-fp-math and
;     -enable-unsafe-fp-math) - a single v_fma_f32 computing x * y - y.
1077define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
1078; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
1079; SI-NOFMA:       ; %bb.0:
1080; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1081; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1082; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1083; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1084; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1085; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1086; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1087; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1088; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1089; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1090; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1091; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1092; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1093; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1094; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1095; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1096; SI-NOFMA-NEXT:    v_add_f32_e32 v0, -1.0, v0
1097; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1098; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
1099; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1100; SI-NOFMA-NEXT:    s_endpgm
1101;
1102; SI-FMA-LABEL: test_f32_mul_add_x_negone_y:
1103; SI-FMA:       ; %bb.0:
1104; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1105; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1106; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1107; SI-FMA-NEXT:    s_mov_b32 s6, -1
1108; SI-FMA-NEXT:    s_mov_b32 s14, s6
1109; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1110; SI-FMA-NEXT:    s_mov_b32 s12, s2
1111; SI-FMA-NEXT:    s_mov_b32 s13, s3
1112; SI-FMA-NEXT:    s_mov_b32 s15, s7
1113; SI-FMA-NEXT:    s_mov_b32 s10, s6
1114; SI-FMA-NEXT:    s_mov_b32 s11, s7
1115; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1116; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1117; SI-FMA-NEXT:    s_mov_b32 s4, s0
1118; SI-FMA-NEXT:    s_mov_b32 s5, s1
1119; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1120; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, -v1
1121; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1122; SI-FMA-NEXT:    s_endpgm
1123;
1124; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
1125; GFX11-NOFMA:       ; %bb.0:
1126; GFX11-NOFMA-NEXT:    s_clause 0x1
1127; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1128; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1129; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1130; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1131; GFX11-NOFMA-NEXT:    s_clause 0x1
1132; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1133; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1134; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1135; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, -1.0, v1
1136; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1137; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1138; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
1139; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1140; GFX11-NOFMA-NEXT:    s_endpgm
1141;
1142; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y:
1143; GFX11-FMA:       ; %bb.0:
1144; GFX11-FMA-NEXT:    s_clause 0x1
1145; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1146; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1147; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1148; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX11-FMA-NEXT:    s_clause 0x1
1150; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1151; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1152; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1153; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
1154; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1155; GFX11-FMA-NEXT:    s_endpgm
1156                                           ptr addrspace(1) %in1,
1157                                           ptr addrspace(1) %in2) {
1158  %x = load float, ptr addrspace(1) %in1
1159  %y = load float, ptr addrspace(1) %in2
  ; Input pattern: fmul(fadd(x, -1.0), y), with the sum as the first multiplicand.
1160  %a = fadd float %x, -1.0
1161  %m = fmul float %a, %y
1162  store float %m, ptr addrspace(1) %out
1163  ret void
1164}
1165
; Kernel under test: loads float x from %in1 and float y from %in2 (plain,
; non-volatile loads), computes y * (x + -1.0), and stores the product to
; %out. This is the operand-commuted form of (x + -1.0) * y.
; Expected codegen, as recorded in the check lines below:
;   NOFMA runs - v_add_f32 with -1.0 followed by v_mul_f32 (y as first source).
;   FMA runs (the RUN lines passing -enable-no-infs-fp-math and
;     -enable-unsafe-fp-math) - a single v_fma_f32 computing x * y - y.
1166define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
1167; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
1168; SI-NOFMA:       ; %bb.0:
1169; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1170; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1171; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1172; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1173; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1174; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1175; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1176; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1177; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1178; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1179; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1180; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1181; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1182; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1183; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1184; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1185; SI-NOFMA-NEXT:    v_add_f32_e32 v0, -1.0, v0
1186; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1187; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1188; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1189; SI-NOFMA-NEXT:    s_endpgm
1190;
1191; SI-FMA-LABEL: test_f32_mul_y_add_x_negone:
1192; SI-FMA:       ; %bb.0:
1193; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1194; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1195; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1196; SI-FMA-NEXT:    s_mov_b32 s6, -1
1197; SI-FMA-NEXT:    s_mov_b32 s14, s6
1198; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1199; SI-FMA-NEXT:    s_mov_b32 s12, s2
1200; SI-FMA-NEXT:    s_mov_b32 s13, s3
1201; SI-FMA-NEXT:    s_mov_b32 s15, s7
1202; SI-FMA-NEXT:    s_mov_b32 s10, s6
1203; SI-FMA-NEXT:    s_mov_b32 s11, s7
1204; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1205; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1206; SI-FMA-NEXT:    s_mov_b32 s4, s0
1207; SI-FMA-NEXT:    s_mov_b32 s5, s1
1208; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1209; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, -v1
1210; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1211; SI-FMA-NEXT:    s_endpgm
1212;
1213; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
1214; GFX11-NOFMA:       ; %bb.0:
1215; GFX11-NOFMA-NEXT:    s_clause 0x1
1216; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1217; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1218; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1219; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1220; GFX11-NOFMA-NEXT:    s_clause 0x1
1221; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1222; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1223; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1224; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, -1.0, v1
1225; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1226; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1227; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
1228; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1229; GFX11-NOFMA-NEXT:    s_endpgm
1230;
1231; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone:
1232; GFX11-FMA:       ; %bb.0:
1233; GFX11-FMA-NEXT:    s_clause 0x1
1234; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1235; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1236; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1237; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX11-FMA-NEXT:    s_clause 0x1
1239; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1240; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1241; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1242; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
1243; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1244; GFX11-FMA-NEXT:    s_endpgm
1245                                           ptr addrspace(1) %in1,
1246                                           ptr addrspace(1) %in2) {
1247  %x = load float, ptr addrspace(1) %in1
1248  %y = load float, ptr addrspace(1) %in2
  ; Input pattern: fmul(y, fadd(x, -1.0)), with the sum as the second multiplicand.
1249  %a = fadd float %x, -1.0
1250  %m = fmul float %y, %a
1251  store float %m, ptr addrspace(1) %out
1252  ret void
1253}
1254
; Kernel under test: loads float x from %in1 and float y from %in2 (plain,
; non-volatile loads), computes (1.0 - x) * y, and stores the product to %out.
; Expected codegen, as recorded in the check lines below:
;   NOFMA runs - v_sub_f32 from 1.0 followed by v_mul_f32.
;   FMA runs (the RUN lines passing -enable-no-infs-fp-math and
;     -enable-unsafe-fp-math) - a single v_fma_f32 computing -x * y + y.
1255define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
1256; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
1257; SI-NOFMA:       ; %bb.0:
1258; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1259; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1260; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1261; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1262; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1263; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1264; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1265; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1266; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1267; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1268; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1269; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1270; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1271; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1272; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1273; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1274; SI-NOFMA-NEXT:    v_sub_f32_e32 v0, 1.0, v0
1275; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1276; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
1277; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1278; SI-NOFMA-NEXT:    s_endpgm
1279;
1280; SI-FMA-LABEL: test_f32_mul_sub_one_x_y:
1281; SI-FMA:       ; %bb.0:
1282; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1283; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1284; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1285; SI-FMA-NEXT:    s_mov_b32 s6, -1
1286; SI-FMA-NEXT:    s_mov_b32 s14, s6
1287; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1288; SI-FMA-NEXT:    s_mov_b32 s12, s2
1289; SI-FMA-NEXT:    s_mov_b32 s13, s3
1290; SI-FMA-NEXT:    s_mov_b32 s15, s7
1291; SI-FMA-NEXT:    s_mov_b32 s10, s6
1292; SI-FMA-NEXT:    s_mov_b32 s11, s7
1293; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1294; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1295; SI-FMA-NEXT:    s_mov_b32 s4, s0
1296; SI-FMA-NEXT:    s_mov_b32 s5, s1
1297; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1298; SI-FMA-NEXT:    v_fma_f32 v0, -v0, v1, v1
1299; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1300; SI-FMA-NEXT:    s_endpgm
1301;
1302; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
1303; GFX11-NOFMA:       ; %bb.0:
1304; GFX11-NOFMA-NEXT:    s_clause 0x1
1305; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1306; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1307; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1308; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1309; GFX11-NOFMA-NEXT:    s_clause 0x1
1310; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1311; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1312; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1313; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v1, 1.0, v1
1314; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1315; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1316; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
1317; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1318; GFX11-NOFMA-NEXT:    s_endpgm
1319;
1320; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y:
1321; GFX11-FMA:       ; %bb.0:
1322; GFX11-FMA-NEXT:    s_clause 0x1
1323; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1324; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1325; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1326; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1327; GFX11-FMA-NEXT:    s_clause 0x1
1328; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1329; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1330; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1331; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, v2
1332; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1333; GFX11-FMA-NEXT:    s_endpgm
1334                                        ptr addrspace(1) %in1,
1335                                        ptr addrspace(1) %in2) {
1336  %x = load float, ptr addrspace(1) %in1
1337  %y = load float, ptr addrspace(1) %in2
  ; Input pattern: fmul(fsub(1.0, x), y), with the difference as the first
  ; multiplicand.
1338  %s = fsub float 1.0, %x
1339  %m = fmul float %s, %y
1340  store float %m, ptr addrspace(1) %out
1341  ret void
1342}
1343
; Tests m = y * (1.0 - x): the fsub result is the second fmul operand.
; The *-FMA check lines expect the fsub/fmul pair to be emitted as a single
; v_fma_f32 computing (-x * y) + y; the *-NOFMA check lines expect a separate
; v_sub_f32 followed by v_mul_f32.
1344define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
1345; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
1346; SI-NOFMA:       ; %bb.0:
1347; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1348; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1349; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1350; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1351; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1352; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1353; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1354; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1355; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1356; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1357; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1358; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1359; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1360; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1361; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1362; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1363; SI-NOFMA-NEXT:    v_sub_f32_e32 v0, 1.0, v0
1364; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1365; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1366; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1367; SI-NOFMA-NEXT:    s_endpgm
1368;
1369; SI-FMA-LABEL: test_f32_mul_y_sub_one_x:
1370; SI-FMA:       ; %bb.0:
1371; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1372; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1373; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1374; SI-FMA-NEXT:    s_mov_b32 s6, -1
1375; SI-FMA-NEXT:    s_mov_b32 s14, s6
1376; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1377; SI-FMA-NEXT:    s_mov_b32 s12, s2
1378; SI-FMA-NEXT:    s_mov_b32 s13, s3
1379; SI-FMA-NEXT:    s_mov_b32 s15, s7
1380; SI-FMA-NEXT:    s_mov_b32 s10, s6
1381; SI-FMA-NEXT:    s_mov_b32 s11, s7
1382; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1383; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1384; SI-FMA-NEXT:    s_mov_b32 s4, s0
1385; SI-FMA-NEXT:    s_mov_b32 s5, s1
1386; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1387; SI-FMA-NEXT:    v_fma_f32 v0, -v0, v1, v1
1388; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1389; SI-FMA-NEXT:    s_endpgm
1390;
1391; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
1392; GFX11-NOFMA:       ; %bb.0:
1393; GFX11-NOFMA-NEXT:    s_clause 0x1
1394; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1395; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1396; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1397; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1398; GFX11-NOFMA-NEXT:    s_clause 0x1
1399; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1400; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1401; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1402; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v1, 1.0, v1
1403; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1404; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1405; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
1406; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1407; GFX11-NOFMA-NEXT:    s_endpgm
1408;
1409; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x:
1410; GFX11-FMA:       ; %bb.0:
1411; GFX11-FMA-NEXT:    s_clause 0x1
1412; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1413; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1414; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1415; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX11-FMA-NEXT:    s_clause 0x1
1417; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1418; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1419; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1420; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, v2
1421; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1422; GFX11-FMA-NEXT:    s_endpgm
1423                                        ptr addrspace(1) %in1,
1424                                        ptr addrspace(1) %in2) {
1425  %x = load float, ptr addrspace(1) %in1
1426  %y = load float, ptr addrspace(1) %in2
1427  %s = fsub float 1.0, %x
1428  %m = fmul float %y, %s
1429  store float %m, ptr addrspace(1) %out
1430  ret void
1431}
1432
; Tests m = (-1.0 - x) * y: the fsub result is the first fmul operand.
; The *-FMA check lines expect a single v_fma_f32 computing (-x * y) - y;
; the *-NOFMA check lines expect a separate v_sub_f32 (from -1.0) followed by
; v_mul_f32.
1433define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
1434; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
1435; SI-NOFMA:       ; %bb.0:
1436; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1437; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1438; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1439; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1440; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1441; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1442; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1443; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1444; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1445; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1446; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1447; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1448; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1449; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1450; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1451; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1452; SI-NOFMA-NEXT:    v_sub_f32_e32 v0, -1.0, v0
1453; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1454; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
1455; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1456; SI-NOFMA-NEXT:    s_endpgm
1457;
1458; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y:
1459; SI-FMA:       ; %bb.0:
1460; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1461; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1462; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1463; SI-FMA-NEXT:    s_mov_b32 s6, -1
1464; SI-FMA-NEXT:    s_mov_b32 s14, s6
1465; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1466; SI-FMA-NEXT:    s_mov_b32 s12, s2
1467; SI-FMA-NEXT:    s_mov_b32 s13, s3
1468; SI-FMA-NEXT:    s_mov_b32 s15, s7
1469; SI-FMA-NEXT:    s_mov_b32 s10, s6
1470; SI-FMA-NEXT:    s_mov_b32 s11, s7
1471; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1472; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1473; SI-FMA-NEXT:    s_mov_b32 s4, s0
1474; SI-FMA-NEXT:    s_mov_b32 s5, s1
1475; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1476; SI-FMA-NEXT:    v_fma_f32 v0, -v0, v1, -v1
1477; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1478; SI-FMA-NEXT:    s_endpgm
1479;
1480; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
1481; GFX11-NOFMA:       ; %bb.0:
1482; GFX11-NOFMA-NEXT:    s_clause 0x1
1483; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1484; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1485; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1486; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1487; GFX11-NOFMA-NEXT:    s_clause 0x1
1488; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1489; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1490; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1491; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v1, -1.0, v1
1492; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1493; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1494; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
1495; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1496; GFX11-NOFMA-NEXT:    s_endpgm
1497;
1498; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y:
1499; GFX11-FMA:       ; %bb.0:
1500; GFX11-FMA-NEXT:    s_clause 0x1
1501; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1502; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1503; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1504; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1505; GFX11-FMA-NEXT:    s_clause 0x1
1506; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1507; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1508; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1509; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, -v2
1510; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1511; GFX11-FMA-NEXT:    s_endpgm
1512                                           ptr addrspace(1) %in1,
1513                                           ptr addrspace(1) %in2) {
1514  %x = load float, ptr addrspace(1) %in1
1515  %y = load float, ptr addrspace(1) %in2
1516  %s = fsub float -1.0, %x
1517  %m = fmul float %s, %y
1518  store float %m, ptr addrspace(1) %out
1519  ret void
1520}
1521
; Tests m = y * (-1.0 - x): the fsub result is the second fmul operand.
; The *-FMA check lines expect a single v_fma_f32 computing (-x * y) - y;
; the *-NOFMA check lines expect a separate v_sub_f32 (from -1.0) followed by
; v_mul_f32.
1522define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
1523; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
1524; SI-NOFMA:       ; %bb.0:
1525; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1526; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1527; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1528; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1529; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1530; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1531; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1532; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1533; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1534; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1535; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1536; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1537; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1538; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1539; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1540; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1541; SI-NOFMA-NEXT:    v_sub_f32_e32 v0, -1.0, v0
1542; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1543; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1544; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1545; SI-NOFMA-NEXT:    s_endpgm
1546;
1547; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x:
1548; SI-FMA:       ; %bb.0:
1549; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1550; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1551; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1552; SI-FMA-NEXT:    s_mov_b32 s6, -1
1553; SI-FMA-NEXT:    s_mov_b32 s14, s6
1554; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1555; SI-FMA-NEXT:    s_mov_b32 s12, s2
1556; SI-FMA-NEXT:    s_mov_b32 s13, s3
1557; SI-FMA-NEXT:    s_mov_b32 s15, s7
1558; SI-FMA-NEXT:    s_mov_b32 s10, s6
1559; SI-FMA-NEXT:    s_mov_b32 s11, s7
1560; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1561; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1562; SI-FMA-NEXT:    s_mov_b32 s4, s0
1563; SI-FMA-NEXT:    s_mov_b32 s5, s1
1564; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1565; SI-FMA-NEXT:    v_fma_f32 v0, -v0, v1, -v1
1566; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1567; SI-FMA-NEXT:    s_endpgm
1568;
1569; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
1570; GFX11-NOFMA:       ; %bb.0:
1571; GFX11-NOFMA-NEXT:    s_clause 0x1
1572; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1573; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1574; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1575; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1576; GFX11-NOFMA-NEXT:    s_clause 0x1
1577; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1578; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1579; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1580; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v1, -1.0, v1
1581; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1582; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1583; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
1584; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1585; GFX11-NOFMA-NEXT:    s_endpgm
1586;
1587; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x:
1588; GFX11-FMA:       ; %bb.0:
1589; GFX11-FMA-NEXT:    s_clause 0x1
1590; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1591; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1592; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1593; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX11-FMA-NEXT:    s_clause 0x1
1595; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1596; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1597; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1598; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, -v2
1599; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1600; GFX11-FMA-NEXT:    s_endpgm
1601                                         ptr addrspace(1) %in1,
1602                                         ptr addrspace(1) %in2) {
1603  %x = load float, ptr addrspace(1) %in1
1604  %y = load float, ptr addrspace(1) %in2
1605  %s = fsub float -1.0, %x
1606  %m = fmul float %y, %s
1607  store float %m, ptr addrspace(1) %out
1608  ret void
1609}
1610
; Tests m = (x - 1.0) * y: the fsub result is the first fmul operand.
; The *-FMA check lines expect a single v_fma_f32 computing (x * y) - y;
; the *-NOFMA check lines expect the subtraction emitted as v_add_f32 with
; -1.0, followed by v_mul_f32.
1611define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
1612; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
1613; SI-NOFMA:       ; %bb.0:
1614; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1615; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1616; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1617; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1618; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1619; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1620; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1621; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1622; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1623; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1624; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1625; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1626; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1627; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1628; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1629; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1630; SI-NOFMA-NEXT:    v_add_f32_e32 v0, -1.0, v0
1631; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1632; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
1633; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1634; SI-NOFMA-NEXT:    s_endpgm
1635;
1636; SI-FMA-LABEL: test_f32_mul_sub_x_one_y:
1637; SI-FMA:       ; %bb.0:
1638; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1639; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1640; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1641; SI-FMA-NEXT:    s_mov_b32 s6, -1
1642; SI-FMA-NEXT:    s_mov_b32 s14, s6
1643; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1644; SI-FMA-NEXT:    s_mov_b32 s12, s2
1645; SI-FMA-NEXT:    s_mov_b32 s13, s3
1646; SI-FMA-NEXT:    s_mov_b32 s15, s7
1647; SI-FMA-NEXT:    s_mov_b32 s10, s6
1648; SI-FMA-NEXT:    s_mov_b32 s11, s7
1649; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1650; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1651; SI-FMA-NEXT:    s_mov_b32 s4, s0
1652; SI-FMA-NEXT:    s_mov_b32 s5, s1
1653; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1654; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, -v1
1655; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1656; SI-FMA-NEXT:    s_endpgm
1657;
1658; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
1659; GFX11-NOFMA:       ; %bb.0:
1660; GFX11-NOFMA-NEXT:    s_clause 0x1
1661; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1662; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1663; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1664; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1665; GFX11-NOFMA-NEXT:    s_clause 0x1
1666; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1667; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1668; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1669; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, -1.0, v1
1670; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1671; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1672; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
1673; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1674; GFX11-NOFMA-NEXT:    s_endpgm
1675;
1676; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y:
1677; GFX11-FMA:       ; %bb.0:
1678; GFX11-FMA-NEXT:    s_clause 0x1
1679; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1680; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1681; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1682; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1683; GFX11-FMA-NEXT:    s_clause 0x1
1684; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1685; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1686; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1687; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
1688; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1689; GFX11-FMA-NEXT:    s_endpgm
1690                                        ptr addrspace(1) %in1,
1691                                        ptr addrspace(1) %in2) {
1692  %x = load float, ptr addrspace(1) %in1
1693  %y = load float, ptr addrspace(1) %in2
1694  %s = fsub float %x, 1.0
1695  %m = fmul float %s, %y
1696  store float %m, ptr addrspace(1) %out
1697  ret void
1698}
1699
; Tests m = y * (x - 1.0): the fsub result is the second fmul operand.
; The *-FMA check lines expect a single v_fma_f32 computing (x * y) - y;
; the *-NOFMA check lines expect the subtraction emitted as v_add_f32 with
; -1.0, followed by v_mul_f32.
1700define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
1701; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
1702; SI-NOFMA:       ; %bb.0:
1703; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1704; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1705; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1706; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1707; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1708; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1709; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1710; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1711; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1712; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1713; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1714; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1715; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1716; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1717; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1718; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1719; SI-NOFMA-NEXT:    v_add_f32_e32 v0, -1.0, v0
1720; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1721; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1722; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1723; SI-NOFMA-NEXT:    s_endpgm
1724;
1725; SI-FMA-LABEL: test_f32_mul_y_sub_x_one:
1726; SI-FMA:       ; %bb.0:
1727; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1728; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1729; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1730; SI-FMA-NEXT:    s_mov_b32 s6, -1
1731; SI-FMA-NEXT:    s_mov_b32 s14, s6
1732; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1733; SI-FMA-NEXT:    s_mov_b32 s12, s2
1734; SI-FMA-NEXT:    s_mov_b32 s13, s3
1735; SI-FMA-NEXT:    s_mov_b32 s15, s7
1736; SI-FMA-NEXT:    s_mov_b32 s10, s6
1737; SI-FMA-NEXT:    s_mov_b32 s11, s7
1738; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1739; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1740; SI-FMA-NEXT:    s_mov_b32 s4, s0
1741; SI-FMA-NEXT:    s_mov_b32 s5, s1
1742; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1743; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, -v1
1744; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1745; SI-FMA-NEXT:    s_endpgm
1746;
1747; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
1748; GFX11-NOFMA:       ; %bb.0:
1749; GFX11-NOFMA-NEXT:    s_clause 0x1
1750; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1751; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1752; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1753; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1754; GFX11-NOFMA-NEXT:    s_clause 0x1
1755; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1756; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1757; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1758; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, -1.0, v1
1759; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1760; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1761; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
1762; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1763; GFX11-NOFMA-NEXT:    s_endpgm
1764;
1765; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one:
1766; GFX11-FMA:       ; %bb.0:
1767; GFX11-FMA-NEXT:    s_clause 0x1
1768; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1769; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1770; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1771; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1772; GFX11-FMA-NEXT:    s_clause 0x1
1773; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1774; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1775; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1776; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
1777; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1778; GFX11-FMA-NEXT:    s_endpgm
1779                                      ptr addrspace(1) %in1,
1780                                      ptr addrspace(1) %in2) {
1781  %x = load float, ptr addrspace(1) %in1
1782  %y = load float, ptr addrspace(1) %in2
1783  %s = fsub float %x, 1.0
1784  %m = fmul float %y, %s
1785  store float %m, ptr addrspace(1) %out
1786  ret void
1787}
1788
; Tests m = (x - (-1.0)) * y: the fsub result is the first fmul operand.
; The FMA check lines expect a single fused op computing (x * y) + y:
; v_fma_f32 for SI-FMA and the accumulating v_fmac_f32 form for GFX11-FMA.
; The *-NOFMA check lines expect the subtraction emitted as v_add_f32 with
; 1.0, followed by v_mul_f32.
1789define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
1790; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
1791; SI-NOFMA:       ; %bb.0:
1792; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1793; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1794; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1795; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1796; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1797; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1798; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1799; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1800; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1801; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1802; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1803; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1804; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1805; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1806; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1807; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1808; SI-NOFMA-NEXT:    v_add_f32_e32 v0, 1.0, v0
1809; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1810; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
1811; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1812; SI-NOFMA-NEXT:    s_endpgm
1813;
1814; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y:
1815; SI-FMA:       ; %bb.0:
1816; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1817; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1818; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1819; SI-FMA-NEXT:    s_mov_b32 s6, -1
1820; SI-FMA-NEXT:    s_mov_b32 s14, s6
1821; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1822; SI-FMA-NEXT:    s_mov_b32 s12, s2
1823; SI-FMA-NEXT:    s_mov_b32 s13, s3
1824; SI-FMA-NEXT:    s_mov_b32 s15, s7
1825; SI-FMA-NEXT:    s_mov_b32 s10, s6
1826; SI-FMA-NEXT:    s_mov_b32 s11, s7
1827; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1828; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1829; SI-FMA-NEXT:    s_mov_b32 s4, s0
1830; SI-FMA-NEXT:    s_mov_b32 s5, s1
1831; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1832; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
1833; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1834; SI-FMA-NEXT:    s_endpgm
1835;
1836; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
1837; GFX11-NOFMA:       ; %bb.0:
1838; GFX11-NOFMA-NEXT:    s_clause 0x1
1839; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1840; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1841; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1842; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1843; GFX11-NOFMA-NEXT:    s_clause 0x1
1844; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1845; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1846; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1847; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, 1.0, v1
1848; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1849; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1850; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
1851; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1852; GFX11-NOFMA-NEXT:    s_endpgm
1853;
1854; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y:
1855; GFX11-FMA:       ; %bb.0:
1856; GFX11-FMA-NEXT:    s_clause 0x1
1857; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1858; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1859; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1860; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1861; GFX11-FMA-NEXT:    s_clause 0x1
1862; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1863; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1864; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1865; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
1866; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1]
1867; GFX11-FMA-NEXT:    s_endpgm
1868                                         ptr addrspace(1) %in1,
1869                                         ptr addrspace(1) %in2) {
1870  %x = load float, ptr addrspace(1) %in1
1871  %y = load float, ptr addrspace(1) %in2
1872  %s = fsub float %x, -1.0
1873  %m = fmul float %s, %y
1874  store float %m, ptr addrspace(1) %out
1875  ret void
1876}
1877
; Tests m = y * (x - (-1.0)): the fsub result is the second fmul operand.
; The FMA check lines expect a single fused op computing (x * y) + y:
; v_fma_f32 for SI-FMA and the accumulating v_fmac_f32 form for GFX11-FMA.
; The *-NOFMA check lines expect the subtraction emitted as v_add_f32 with
; 1.0, followed by v_mul_f32.
1878define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
1879; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
1880; SI-NOFMA:       ; %bb.0:
1881; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1882; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1883; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
1884; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
1885; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
1886; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1887; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1888; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1889; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
1890; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
1891; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
1892; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1893; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1894; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
1895; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
1896; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1897; SI-NOFMA-NEXT:    v_add_f32_e32 v0, 1.0, v0
1898; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1899; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1900; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1901; SI-NOFMA-NEXT:    s_endpgm
1902;
1903; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone:
1904; SI-FMA:       ; %bb.0:
1905; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1906; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1907; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
1908; SI-FMA-NEXT:    s_mov_b32 s6, -1
1909; SI-FMA-NEXT:    s_mov_b32 s14, s6
1910; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1911; SI-FMA-NEXT:    s_mov_b32 s12, s2
1912; SI-FMA-NEXT:    s_mov_b32 s13, s3
1913; SI-FMA-NEXT:    s_mov_b32 s15, s7
1914; SI-FMA-NEXT:    s_mov_b32 s10, s6
1915; SI-FMA-NEXT:    s_mov_b32 s11, s7
1916; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1917; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
1918; SI-FMA-NEXT:    s_mov_b32 s4, s0
1919; SI-FMA-NEXT:    s_mov_b32 s5, s1
1920; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
1921; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
1922; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1923; SI-FMA-NEXT:    s_endpgm
1924;
1925; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
1926; GFX11-NOFMA:       ; %bb.0:
1927; GFX11-NOFMA-NEXT:    s_clause 0x1
1928; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1929; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1930; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
1931; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1932; GFX11-NOFMA-NEXT:    s_clause 0x1
1933; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1934; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1935; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1936; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, 1.0, v1
1937; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
1938; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1939; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
1940; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1941; GFX11-NOFMA-NEXT:    s_endpgm
1942;
1943; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone:
1944; GFX11-FMA:       ; %bb.0:
1945; GFX11-FMA-NEXT:    s_clause 0x1
1946; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1947; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1948; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1949; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1950; GFX11-FMA-NEXT:    s_clause 0x1
1951; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1952; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1953; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1954; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
1955; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1]
1956; GFX11-FMA-NEXT:    s_endpgm
1957                                         ptr addrspace(1) %in1,
1958                                         ptr addrspace(1) %in2) {
1959  %x = load float, ptr addrspace(1) %in1
1960  %y = load float, ptr addrspace(1) %in2
1961  %s = fsub float %x, -1.0
1962  %m = fmul float %y, %s
1963  store float %m, ptr addrspace(1) %out
1964  ret void
1965}
1966
1967;
1968; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
1969;
1970
; Tests the f32 interpolation r = x * t + y * (1.0 - t), with x, y and t
; loaded from %in1, %in2 and %in3 respectively.
; The *-FMA check lines expect two fused ops: first (-t * y) + y via v_fma_f32,
; then x * t added to that result (v_fma_f32 on SI, v_fmac_f32 on GFX11).
; The *-NOFMA check lines expect v_sub_f32 (1.0 - t), v_mul_f32 (y * that), and
; then a multiply-accumulate of x * t (v_mac_f32 on SI, v_fmac_f32 on GFX11).
1971define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
1972; SI-NOFMA-LABEL: test_f32_interp:
1973; SI-NOFMA:       ; %bb.0:
1974; SI-NOFMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1975; SI-NOFMA-NEXT:    s_mov_b32 s11, 0xf000
1976; SI-NOFMA-NEXT:    s_mov_b32 s10, -1
1977; SI-NOFMA-NEXT:    s_mov_b32 s14, s10
1978; SI-NOFMA-NEXT:    s_mov_b32 s15, s11
1979; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
1980; SI-NOFMA-NEXT:    s_mov_b32 s16, s4
1981; SI-NOFMA-NEXT:    s_mov_b32 s17, s5
1982; SI-NOFMA-NEXT:    s_mov_b32 s4, s6
1983; SI-NOFMA-NEXT:    s_mov_b32 s5, s7
1984; SI-NOFMA-NEXT:    s_mov_b32 s6, s10
1985; SI-NOFMA-NEXT:    s_mov_b32 s7, s11
1986; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
1987; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
1988; SI-NOFMA-NEXT:    s_mov_b32 s18, s10
1989; SI-NOFMA-NEXT:    s_mov_b32 s19, s11
1990; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1991; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[16:19], 0
1992; SI-NOFMA-NEXT:    buffer_load_dword v2, off, s[12:15], 0
1993; SI-NOFMA-NEXT:    s_mov_b32 s8, s0
1994; SI-NOFMA-NEXT:    s_mov_b32 s9, s1
1995; SI-NOFMA-NEXT:    s_waitcnt vmcnt(2)
1996; SI-NOFMA-NEXT:    v_sub_f32_e32 v3, 1.0, v0
1997; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
1998; SI-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v3
1999; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
2000; SI-NOFMA-NEXT:    v_mac_f32_e32 v1, v2, v0
2001; SI-NOFMA-NEXT:    buffer_store_dword v1, off, s[8:11], 0
2002; SI-NOFMA-NEXT:    s_endpgm
2003;
2004; SI-FMA-LABEL: test_f32_interp:
2005; SI-FMA:       ; %bb.0:
2006; SI-FMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
2007; SI-FMA-NEXT:    s_mov_b32 s11, 0xf000
2008; SI-FMA-NEXT:    s_mov_b32 s10, -1
2009; SI-FMA-NEXT:    s_mov_b32 s18, s10
2010; SI-FMA-NEXT:    s_mov_b32 s19, s11
2011; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
2012; SI-FMA-NEXT:    s_mov_b32 s16, s4
2013; SI-FMA-NEXT:    s_mov_b32 s17, s5
2014; SI-FMA-NEXT:    s_mov_b32 s14, s10
2015; SI-FMA-NEXT:    s_mov_b32 s12, s2
2016; SI-FMA-NEXT:    s_mov_b32 s13, s3
2017; SI-FMA-NEXT:    s_mov_b32 s15, s11
2018; SI-FMA-NEXT:    s_mov_b32 s4, s6
2019; SI-FMA-NEXT:    s_mov_b32 s5, s7
2020; SI-FMA-NEXT:    s_mov_b32 s6, s10
2021; SI-FMA-NEXT:    s_mov_b32 s7, s11
2022; SI-FMA-NEXT:    buffer_load_dword v0, off, s[16:19], 0
2023; SI-FMA-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2024; SI-FMA-NEXT:    buffer_load_dword v2, off, s[12:15], 0
2025; SI-FMA-NEXT:    s_mov_b32 s8, s0
2026; SI-FMA-NEXT:    s_mov_b32 s9, s1
2027; SI-FMA-NEXT:    s_waitcnt vmcnt(1)
2028; SI-FMA-NEXT:    v_fma_f32 v0, -v1, v0, v0
2029; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
2030; SI-FMA-NEXT:    v_fma_f32 v0, v2, v1, v0
2031; SI-FMA-NEXT:    buffer_store_dword v0, off, s[8:11], 0
2032; SI-FMA-NEXT:    s_endpgm
2033;
2034; GFX11-NOFMA-LABEL: test_f32_interp:
2035; GFX11-NOFMA:       ; %bb.0:
2036; GFX11-NOFMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2037; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
2038; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
2039; GFX11-NOFMA-NEXT:    s_clause 0x2
2040; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[6:7]
2041; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
2042; GFX11-NOFMA-NEXT:    global_load_b32 v3, v0, s[2:3]
2043; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(2)
2044; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v4, 1.0, v1
2045; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
2046; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2047; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v2, v2, v4
2048; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
2049; GFX11-NOFMA-NEXT:    v_fmac_f32_e32 v2, v3, v1
2050; GFX11-NOFMA-NEXT:    global_store_b32 v0, v2, s[0:1]
2051; GFX11-NOFMA-NEXT:    s_endpgm
2052;
2053; GFX11-FMA-LABEL: test_f32_interp:
2054; GFX11-FMA:       ; %bb.0:
2055; GFX11-FMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2056; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
2057; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
2058; GFX11-FMA-NEXT:    s_clause 0x2
2059; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[4:5]
2060; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[6:7]
2061; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[2:3]
2062; GFX11-FMA-NEXT:    s_waitcnt vmcnt(1)
2063; GFX11-FMA-NEXT:    v_fma_f32 v1, -v2, v1, v1
2064; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
2065; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2066; GFX11-FMA-NEXT:    v_fmac_f32_e32 v1, v3, v2
2067; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
2068; GFX11-FMA-NEXT:    s_endpgm
2069                             ptr addrspace(1) %in1,
2070                             ptr addrspace(1) %in2,
2071                             ptr addrspace(1) %in3) {
2072  %x = load float, ptr addrspace(1) %in1
2073  %y = load float, ptr addrspace(1) %in2
2074  %t = load float, ptr addrspace(1) %in3
2075  %t1 = fsub float 1.0, %t
2076  %tx = fmul float %x, %t
2077  %ty = fmul float %y, %t1
2078  %r = fadd float %tx, %ty
2079  store float %r, ptr addrspace(1) %out
2080  ret void
2081}
2082
; Double-precision linear-interpolation pattern: stores x*t + y*(1.0 - t) to
; %out, where x, y and t are loaded from %in1, %in2 and %in3 respectively.
; The SI-FMA and GFX11-FMA check blocks expect the whole expression to be
; emitted as two v_fma_f64 instructions with no separate subtract; the
; GFX11-NOFMA block expects v_add_f64 (negated t plus 1.0), v_mul_f64 and a
; single v_fma_f64.
; NOTE(review): there is no SI-NOFMA check block for this function, although
; test_f32_interp has one. Presumably the generator dropped it because the
; tahiti and verde invocations share that prefix but produce different code
; for f64 - confirm by regenerating the assertions.
2083define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
2084; SI-FMA-LABEL: test_f64_interp:
2085; SI-FMA:       ; %bb.0:
2086; SI-FMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
2087; SI-FMA-NEXT:    s_mov_b32 s11, 0xf000
2088; SI-FMA-NEXT:    s_mov_b32 s10, -1
2089; SI-FMA-NEXT:    s_mov_b32 s18, s10
2090; SI-FMA-NEXT:    s_mov_b32 s19, s11
2091; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
2092; SI-FMA-NEXT:    s_mov_b32 s16, s4
2093; SI-FMA-NEXT:    s_mov_b32 s17, s5
2094; SI-FMA-NEXT:    s_mov_b32 s4, s6
2095; SI-FMA-NEXT:    s_mov_b32 s5, s7
2096; SI-FMA-NEXT:    s_mov_b32 s6, s10
2097; SI-FMA-NEXT:    s_mov_b32 s7, s11
2098; SI-FMA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
2099; SI-FMA-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
2100; SI-FMA-NEXT:    s_mov_b32 s14, s10
2101; SI-FMA-NEXT:    s_mov_b32 s12, s2
2102; SI-FMA-NEXT:    s_mov_b32 s13, s3
2103; SI-FMA-NEXT:    s_mov_b32 s15, s11
2104; SI-FMA-NEXT:    buffer_load_dwordx2 v[4:5], off, s[12:15], 0
2105; SI-FMA-NEXT:    s_mov_b32 s8, s0
2106; SI-FMA-NEXT:    s_mov_b32 s9, s1
2107; SI-FMA-NEXT:    s_waitcnt vmcnt(1)
2108; SI-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
2109; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
2110; SI-FMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
2111; SI-FMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2112; SI-FMA-NEXT:    s_endpgm
2113;
2114; GFX11-NOFMA-LABEL: test_f64_interp:
2115; GFX11-NOFMA:       ; %bb.0:
2116; GFX11-NOFMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2117; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v8, 0
2118; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
2119; GFX11-NOFMA-NEXT:    s_clause 0x2
2120; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v8, s[6:7]
2121; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v8, s[4:5]
2122; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v8, s[2:3]
2123; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(2)
2124; GFX11-NOFMA-NEXT:    v_add_f64 v[6:7], -v[0:1], 1.0
2125; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
2126; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2127; GFX11-NOFMA-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
2128; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
2129; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
2130; GFX11-NOFMA-NEXT:    global_store_b64 v8, v[0:1], s[0:1]
2131; GFX11-NOFMA-NEXT:    s_endpgm
2132;
2133; GFX11-FMA-LABEL: test_f64_interp:
2134; GFX11-FMA:       ; %bb.0:
2135; GFX11-FMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2136; GFX11-FMA-NEXT:    v_mov_b32_e32 v6, 0
2137; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
2138; GFX11-FMA-NEXT:    s_clause 0x2
2139; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v6, s[4:5]
2140; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v6, s[6:7]
2141; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v6, s[2:3]
2142; GFX11-FMA-NEXT:    s_waitcnt vmcnt(1)
2143; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
2144; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
2145; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2146; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
2147; GFX11-FMA-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
2148; GFX11-FMA-NEXT:    s_endpgm
2149                             ptr addrspace(1) %in1,
2150                             ptr addrspace(1) %in2,
2151                             ptr addrspace(1) %in3) {
2152  %x = load double, ptr addrspace(1) %in1
2153  %y = load double, ptr addrspace(1) %in2
2154  %t = load double, ptr addrspace(1) %in3
  ; %t weights %x and (1.0 - %t) weights %y; the sum of the two products is
  ; the value stored to %out.
2155  %t1 = fsub double 1.0, %t
2156  %tx = fmul double %x, %t
2157  %ty = fmul double %y, %t1
2158  %r = fadd double %tx, %ty
2159  store double %r, ptr addrspace(1) %out
2160  ret void
2161}
2162
2163; Make sure negative constant cancels out fneg
; Computes llvm.fma.f32(-2.0, -a, b) and stores the result to %out[tid], where
; a and b are volatile loads of %out[tid] and the float immediately after it.
; Both check blocks expect the multiply constant to appear as positive 2.0 and
; the loaded operand to carry no negation modifier.
; %in is not referenced by the body.
2164define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2165; SI-LABEL: fma_neg_2.0_neg_a_b_f32:
2166; SI:       ; %bb.0:
2167; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2168; SI-NEXT:    s_mov_b32 s3, 0xf000
2169; SI-NEXT:    s_mov_b32 s2, 0
2170; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2171; SI-NEXT:    v_mov_b32_e32 v1, 0
2172; SI-NEXT:    s_waitcnt lgkmcnt(0)
2173; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
2174; SI-NEXT:    s_waitcnt vmcnt(0)
2175; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
2176; SI-NEXT:    s_waitcnt vmcnt(0)
2177; SI-NEXT:    v_fma_f32 v2, v2, 2.0, v3
2178; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2179; SI-NEXT:    s_endpgm
2180;
2181; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32:
2182; GFX11:       ; %bb.0:
2183; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2184; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2185; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2186; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2187; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2188; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
2189; GFX11-NEXT:    s_waitcnt vmcnt(0)
2190; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
2191; GFX11-NEXT:    s_waitcnt vmcnt(0)
2192; GFX11-NEXT:    v_fmac_f32_e32 v2, 2.0, v1
2193; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
2194; GFX11-NEXT:    s_endpgm
2195  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out are the same address computation (%out indexed by
  ; %tid); %gep.1 is one float past %gep.0.
2196  %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
2197  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
2198  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
2199
2200  %r1 = load volatile float, ptr addrspace(1) %gep.0
2201  %r2 = load volatile float, ptr addrspace(1) %gep.1
2202
2203  %r1.fneg = fneg float %r1
2204
  ; Negative constant times negated operand: the two sign flips are the
  ; pattern under test.
2205  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
2206  store float %r3, ptr addrspace(1) %gep.out
2207  ret void
2208}
2209
; Computes llvm.fma.f32(2.0, -a, b) and stores the result to %out[tid], where
; a and b are volatile loads of %out[tid] and the float immediately after it.
; Both check blocks expect the negation to be carried by the constant (-2.0)
; with no negation modifier on the loaded operand.
; %in is not referenced by the body.
2210define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2211; SI-LABEL: fma_2.0_neg_a_b_f32:
2212; SI:       ; %bb.0:
2213; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2214; SI-NEXT:    s_mov_b32 s3, 0xf000
2215; SI-NEXT:    s_mov_b32 s2, 0
2216; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2217; SI-NEXT:    v_mov_b32_e32 v1, 0
2218; SI-NEXT:    s_waitcnt lgkmcnt(0)
2219; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
2220; SI-NEXT:    s_waitcnt vmcnt(0)
2221; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
2222; SI-NEXT:    s_waitcnt vmcnt(0)
2223; SI-NEXT:    v_fma_f32 v2, v2, -2.0, v3
2224; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2225; SI-NEXT:    s_endpgm
2226;
2227; GFX11-LABEL: fma_2.0_neg_a_b_f32:
2228; GFX11:       ; %bb.0:
2229; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2230; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2232; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2233; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2234; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
2235; GFX11-NEXT:    s_waitcnt vmcnt(0)
2236; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
2237; GFX11-NEXT:    s_waitcnt vmcnt(0)
2238; GFX11-NEXT:    v_fmac_f32_e32 v2, -2.0, v1
2239; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
2240; GFX11-NEXT:    s_endpgm
2241  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out are the same address computation (%out indexed by
  ; %tid); %gep.1 is one float past %gep.0.
2242  %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
2243  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
2244  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
2245
2246  %r1 = load volatile float, ptr addrspace(1) %gep.0
2247  %r2 = load volatile float, ptr addrspace(1) %gep.1
2248
2249  %r1.fneg = fneg float %r1
2250
  ; Positive constant times negated operand: a single sign flip, which the
  ; expected output places on the constant.
2251  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
2252  store float %r3, ptr addrspace(1) %gep.out
2253  ret void
2254}
2255
; Vector case: computes llvm.fma.v4f32(c, -a, -b) with fast-math flags on the
; fneg and fma operations, where a, b and c are the <4 x float> elements of
; %in at indices tid, tid+1 and tid+3, and stores the result to %out[tid].
; Both check blocks expect four scalar v_fma_f32 instructions, each with
; negation modifiers on its second and third sources and no separate
; negate instructions.
2256define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
2257; SI-LABEL: fma_neg_b_c_v4f32:
2258; SI:       ; %bb.0:
2259; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2260; SI-NEXT:    s_mov_b32 s7, 0xf000
2261; SI-NEXT:    s_mov_b32 s6, 0
2262; SI-NEXT:    v_lshlrev_b32_e32 v12, 4, v0
2263; SI-NEXT:    v_mov_b32_e32 v13, 0
2264; SI-NEXT:    s_waitcnt lgkmcnt(0)
2265; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
2266; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64
2267; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16
2268; SI-NEXT:    buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:48
2269; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
2270; SI-NEXT:    s_waitcnt vmcnt(0)
2271; SI-NEXT:    v_fma_f32 v3, v11, -v3, -v7
2272; SI-NEXT:    v_fma_f32 v2, v10, -v2, -v6
2273; SI-NEXT:    v_fma_f32 v1, v9, -v1, -v5
2274; SI-NEXT:    v_fma_f32 v0, v8, -v0, -v4
2275; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[12:13], s[0:3], 0 addr64
2276; SI-NEXT:    s_endpgm
2277;
2278; GFX11-LABEL: fma_neg_b_c_v4f32:
2279; GFX11:       ; %bb.0:
2280; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2281; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2282; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2283; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 4, v0
2284; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2285; GFX11-NEXT:    s_clause 0x2
2286; GFX11-NEXT:    global_load_b128 v[0:3], v12, s[2:3] offset:16
2287; GFX11-NEXT:    global_load_b128 v[4:7], v12, s[2:3]
2288; GFX11-NEXT:    global_load_b128 v[8:11], v12, s[2:3] offset:48
2289; GFX11-NEXT:    s_waitcnt vmcnt(0)
2290; GFX11-NEXT:    v_fma_f32 v3, v11, -v7, -v3
2291; GFX11-NEXT:    v_fma_f32 v2, v10, -v6, -v2
2292; GFX11-NEXT:    v_fma_f32 v1, v9, -v5, -v1
2293; GFX11-NEXT:    v_fma_f32 v0, v8, -v4, -v0
2294; GFX11-NEXT:    global_store_b128 v12, v[0:3], s[0:1]
2295; GFX11-NEXT:    s_endpgm
2296  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.1 is one vector past %gep.0 and %gep.2 is two vectors past %gep.1,
  ; matching the offset:16 and offset:48 loads in the expected output.
2297  %gep.0 = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %tid
2298  %gep.1 = getelementptr <4 x float>, ptr addrspace(1) %gep.0, i32 1
2299  %gep.2 = getelementptr <4 x float>, ptr addrspace(1) %gep.1, i32 2
2300  %gep.out = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
2301
2302  %tmp0 = load <4 x float>, ptr addrspace(1) %gep.0
2303  %tmp1 = load <4 x float>, ptr addrspace(1) %gep.1
2304  %tmp2 = load <4 x float>, ptr addrspace(1) %gep.2
2305
  ; Both the multiplicand and the addend are negated before the fma.
2306  %fneg0 = fneg fast <4 x float> %tmp0
2307  %fneg1 = fneg fast <4 x float> %tmp1
2308  %fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)
2309
2310  store <4 x float> %fma0, ptr addrspace(1) %gep.out
2311  ret void
2312}
2313
; Attribute groups referenced by the declarations and definitions in this file.
; NOTE(review): #0 includes readnone, yet it is attached to the
; fma_neg_2.0_neg_a_b_f32 and fma_2.0_neg_a_b_f32 kernels, whose bodies
; contain volatile loads and a store - confirm this is intended.
; #2 additionally sets "no-signed-zeros-fp-math" and is used by
; fma_neg_b_c_v4f32.
2314attributes #0 = { nounwind readnone }
2315attributes #1 = { nounwind }
2316attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
2317