xref: /llvm-project/llvm/test/CodeGen/AMDGPU/madak.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-MAD %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-MAD %s
7
8; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX940-FMA %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s
10; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s
11
12declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
13declare float @llvm.fabs.f32(float) nounwind readnone
14
; madak_f32: each work-item loads in.a[tid] and in.b[tid] and stores
; a * b + 10.0 to out[tid]. The float 10.0 has the bit pattern 0x41200000,
; which is the 32-bit literal the assertions below expect as the addend.
; Expected selection, as recorded by the autogenerated assertions:
;   - tahiti, tonga, gfx900 and gfx1010 (default contraction): v_madak_f32
;   - gfx1100 (default contraction): separate v_mul_f32 and v_add_f32
;   - gfx940, gfx1010 and gfx1100 with -fp-contract=fast: v_fmaak_f32
15define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
16; GFX6-LABEL: madak_f32:
17; GFX6:       ; %bb.0:
18; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
19; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
20; GFX6-NEXT:    s_mov_b32 s7, 0xf000
21; GFX6-NEXT:    s_mov_b32 s6, 0
22; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
23; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
24; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
25; GFX6-NEXT:    v_mov_b32_e32 v1, 0
26; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
27; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
28; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
29; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
30; GFX6-NEXT:    s_waitcnt vmcnt(0)
31; GFX6-NEXT:    v_madak_f32 v2, v2, v3, 0x41200000
32; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
33; GFX6-NEXT:    s_endpgm
34;
35; GFX8-LABEL: madak_f32:
36; GFX8:       ; %bb.0:
37; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
38; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
39; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
40; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX8-NEXT:    v_mov_b32_e32 v1, s3
42; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
43; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
44; GFX8-NEXT:    v_mov_b32_e32 v3, s5
45; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
46; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
47; GFX8-NEXT:    flat_load_dword v5, v[0:1]
48; GFX8-NEXT:    flat_load_dword v2, v[2:3]
49; GFX8-NEXT:    v_mov_b32_e32 v1, s1
50; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
51; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
52; GFX8-NEXT:    s_waitcnt vmcnt(0)
53; GFX8-NEXT:    v_madak_f32 v2, v5, v2, 0x41200000
54; GFX8-NEXT:    flat_store_dword v[0:1], v2
55; GFX8-NEXT:    s_endpgm
56;
57; GFX9-LABEL: madak_f32:
58; GFX9:       ; %bb.0:
59; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
60; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
61; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
62; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
63; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
64; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
65; GFX9-NEXT:    s_waitcnt vmcnt(0)
66; GFX9-NEXT:    v_madak_f32 v1, v1, v2, 0x41200000
67; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
68; GFX9-NEXT:    s_endpgm
69;
70; GFX10-MAD-LABEL: madak_f32:
71; GFX10-MAD:       ; %bb.0:
72; GFX10-MAD-NEXT:    s_clause 0x1
73; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
74; GFX10-MAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
75; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
76; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX10-MAD-NEXT:    s_clause 0x1
78; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
79; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7]
80; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
81; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v2, 0x41200000
82; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
83; GFX10-MAD-NEXT:    s_endpgm
84;
85; GFX11-MAD-LABEL: madak_f32:
86; GFX11-MAD:       ; %bb.0:
87; GFX11-MAD-NEXT:    s_clause 0x1
88; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
89; GFX11-MAD-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
90; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
91; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
92; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
93; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX11-MAD-NEXT:    s_clause 0x1
95; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
96; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[4:5]
97; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
98; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v2
99; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
100; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
101; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
102; GFX11-MAD-NEXT:    s_endpgm
103;
104; GFX940-FMA-LABEL: madak_f32:
105; GFX940-FMA:       ; %bb.0:
106; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
107; GFX940-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
108; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
109; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
110; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
112; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
113; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
114; GFX940-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
115; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
116; GFX940-FMA-NEXT:    s_endpgm
117;
118; GFX10-FMA-LABEL: madak_f32:
119; GFX10-FMA:       ; %bb.0:
120; GFX10-FMA-NEXT:    s_clause 0x1
121; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
122; GFX10-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
123; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
124; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX10-FMA-NEXT:    s_clause 0x1
126; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
127; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
128; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
129; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
130; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
131; GFX10-FMA-NEXT:    s_endpgm
132;
133; GFX11-FMA-LABEL: madak_f32:
134; GFX11-FMA:       ; %bb.0:
135; GFX11-FMA-NEXT:    s_clause 0x1
136; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
137; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
138; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
139; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
140; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
141; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX11-FMA-NEXT:    s_clause 0x1
143; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
144; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
145; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
146; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
147; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
148; GFX11-FMA-NEXT:    s_endpgm
  ; The work-item id indexes all three float arrays.
149  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
150  %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
151  %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
152  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
153
154  %a = load float, ptr addrspace(1) %in.a.gep, align 4
155  %b = load float, ptr addrspace(1) %in.b.gep, align 4
156
  ; Separate fmul and fadd with a non-inline constant addend; this is the
  ; pattern whose selection the assertions above pin down.
157  %mul = fmul float %a, %b
158  %madak = fadd float %mul, 10.0
159  store float %madak, ptr addrspace(1) %out.gep, align 4
160  ret void
161}
162
163; Make sure this is only folded with one use. This is a code size
164; optimization and if we fold the immediate multiple times, we'll undo
165; it.
; madak_2_use_f32: two multiply-adds, a * b + 10.0 and a * c + 10.0, share
; the multiplicand %a and the constant 10.0 (bit pattern 0x41200000).
; Expected selection, as recorded by the autogenerated assertions:
;   - tahiti, tonga and gfx900: the constant is materialized once with
;     v_mov_b32; one result uses v_madak_f32 and the other v_mac_f32.
;   - gfx940 with -fp-contract=fast: same shape with v_fmaak_f32 / v_fmac_f32.
;   - gfx1010: the literal appears in both v_madak_f32 (default contraction)
;     or both v_fmaak_f32 (-fp-contract=fast) instructions.
;   - gfx1100: default contraction emits multiplies and adds separately, with
;     the literal in both adds; -fp-contract=fast emits two v_fmaak_f32.
; NOTE(review): %out.gep.1 is derived from %in.gep.0 rather than %out.gep.0,
; so the second store targets in[tid + 1]; the assertions show it using the
; same base registers as the loads. This looks possibly unintended - confirm
; before changing, since every assertion block would need regenerating.
166define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
167; GFX6-LABEL: madak_2_use_f32:
168; GFX6:       ; %bb.0:
169; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
170; GFX6-NEXT:    s_mov_b32 s7, 0xf000
171; GFX6-NEXT:    s_mov_b32 s6, 0
172; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
173; GFX6-NEXT:    v_mov_b32_e32 v1, 0
174; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
176; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
177; GFX6-NEXT:    s_waitcnt vmcnt(0)
178; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
179; GFX6-NEXT:    s_waitcnt vmcnt(0)
180; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
181; GFX6-NEXT:    s_waitcnt vmcnt(0)
182; GFX6-NEXT:    v_mov_b32_e32 v5, 0x41200000
183; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
184; GFX6-NEXT:    v_madak_f32 v3, v2, v3, 0x41200000
185; GFX6-NEXT:    v_mac_f32_e32 v5, v2, v4
186; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
187; GFX6-NEXT:    s_waitcnt vmcnt(0)
188; GFX6-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 offset:4
189; GFX6-NEXT:    s_waitcnt vmcnt(0)
190; GFX6-NEXT:    s_endpgm
191;
192; GFX8-LABEL: madak_2_use_f32:
193; GFX8:       ; %bb.0:
194; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
195; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
196; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
197; GFX8-NEXT:    v_mov_b32_e32 v1, s3
198; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
199; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
200; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
201; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
202; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 8, v0
203; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
204; GFX8-NEXT:    flat_load_dword v7, v[0:1] glc
205; GFX8-NEXT:    s_waitcnt vmcnt(0)
206; GFX8-NEXT:    flat_load_dword v8, v[2:3] glc
207; GFX8-NEXT:    s_waitcnt vmcnt(0)
208; GFX8-NEXT:    flat_load_dword v4, v[4:5] glc
209; GFX8-NEXT:    s_waitcnt vmcnt(0)
210; GFX8-NEXT:    v_mov_b32_e32 v1, s1
211; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
212; GFX8-NEXT:    v_mov_b32_e32 v5, 0x41200000
213; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
214; GFX8-NEXT:    v_madak_f32 v6, v7, v8, 0x41200000
215; GFX8-NEXT:    v_mac_f32_e32 v5, v7, v4
216; GFX8-NEXT:    flat_store_dword v[0:1], v6
217; GFX8-NEXT:    s_waitcnt vmcnt(0)
218; GFX8-NEXT:    flat_store_dword v[2:3], v5
219; GFX8-NEXT:    s_waitcnt vmcnt(0)
220; GFX8-NEXT:    s_endpgm
221;
222; GFX9-LABEL: madak_2_use_f32:
223; GFX9:       ; %bb.0:
224; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
225; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
226; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
227; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
229; GFX9-NEXT:    s_waitcnt vmcnt(0)
230; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
231; GFX9-NEXT:    s_waitcnt vmcnt(0)
232; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
233; GFX9-NEXT:    s_waitcnt vmcnt(0)
234; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
235; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
236; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
237; GFX9-NEXT:    s_waitcnt vmcnt(0)
238; GFX9-NEXT:    global_store_dword v0, v4, s[2:3] offset:4
239; GFX9-NEXT:    s_waitcnt vmcnt(0)
240; GFX9-NEXT:    s_endpgm
241;
242; GFX10-MAD-LABEL: madak_2_use_f32:
243; GFX10-MAD:       ; %bb.0:
244; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
245; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
246; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
247; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
248; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
249; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
250; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
251; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
252; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
253; GFX10-MAD-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
254; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v3, 0x41200000
255; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[0:1]
256; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
257; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
258; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
259; GFX10-MAD-NEXT:    s_endpgm
260;
261; GFX11-MAD-LABEL: madak_2_use_f32:
262; GFX11-MAD:       ; %bb.0:
263; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
264; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
265; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
266; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
267; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
269; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
270; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
271; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
272; GFX11-MAD-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
273; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
274; GFX11-MAD-NEXT:    v_mul_f32_e32 v2, v1, v2
275; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
276; GFX11-MAD-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_add_f32 v2, 0x41200000, v2
277; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
278; GFX11-MAD-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
279; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
280; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
281; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
282; GFX11-MAD-NEXT:    s_endpgm
283;
284; GFX940-FMA-LABEL: madak_2_use_f32:
285; GFX940-FMA:       ; %bb.0:
286; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
287; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
288; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
289; GFX940-FMA-NEXT:    v_mov_b32_e32 v4, 0x41200000
290; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
292; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
293; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
294; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
295; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
296; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
297; GFX940-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
298; GFX940-FMA-NEXT:    v_fmac_f32_e32 v4, v1, v3
299; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
300; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
301; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
302; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
303; GFX940-FMA-NEXT:    s_endpgm
304;
305; GFX10-FMA-LABEL: madak_2_use_f32:
306; GFX10-FMA:       ; %bb.0:
307; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
308; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
309; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
311; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
312; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
313; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
314; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
315; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
316; GFX10-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
317; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
318; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[0:1]
319; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
320; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
321; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
322; GFX10-FMA-NEXT:    s_endpgm
323;
324; GFX11-FMA-LABEL: madak_2_use_f32:
325; GFX11-FMA:       ; %bb.0:
326; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
327; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
328; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
329; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
330; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
332; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
333; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
334; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
335; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
336; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
337; GFX11-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
338; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
339; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
340; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
341; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
342; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
343; GFX11-FMA-NEXT:    s_endpgm
344  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
345
  ; Three consecutive input floats starting at in[tid].
346  %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
347  %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
348  %in.gep.2 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 2
349
  ; NOTE(review): the second output pointer is based on %in.gep.0, not
  ; %out.gep.0 - confirm whether this is intentional.
350  %out.gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
351  %out.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
352
  ; Volatile accesses: the assertions show each load and store emitted
  ; individually with a wait after it.
353  %a = load volatile float, ptr addrspace(1) %in.gep.0, align 4
354  %b = load volatile float, ptr addrspace(1) %in.gep.1, align 4
355  %c = load volatile float, ptr addrspace(1) %in.gep.2, align 4
356
  ; Both results add the same constant, giving it two uses.
357  %mul0 = fmul float %a, %b
358  %mul1 = fmul float %a, %c
359  %madak0 = fadd float %mul0, 10.0
360  %madak1 = fadd float %mul1, 10.0
361
362  store volatile float %madak0, ptr addrspace(1) %out.gep.0, align 4
363  store volatile float %madak1, ptr addrspace(1) %out.gep.1, align 4
364  ret void
365}
366
; madak_m_inline_imm_f32: each work-item loads in.a[tid] and stores
; 4.0 * a + 10.0 to out[tid]. The multiplier is a constant that the
; assertions show written directly as the operand 4.0, while the addend 10.0
; appears as the 32-bit literal 0x41200000.
; Expected selection, as recorded by the autogenerated assertions:
;   - tahiti, tonga, gfx900 and gfx1010 (default contraction): v_madak_f32
;     with 4.0 as the first source operand
;   - gfx1100 (default contraction): v_mul_f32 by 4.0, then v_add_f32 of the
;     literal
;   - gfx940, gfx1010 and gfx1100 with -fp-contract=fast: v_fmaak_f32 with 4.0
;     as the first source operand
367define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 {
368; GFX6-LABEL: madak_m_inline_imm_f32:
369; GFX6:       ; %bb.0:
370; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
371; GFX6-NEXT:    s_mov_b32 s7, 0xf000
372; GFX6-NEXT:    s_mov_b32 s6, 0
373; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
374; GFX6-NEXT:    v_mov_b32_e32 v1, 0
375; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
376; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
377; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
378; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
379; GFX6-NEXT:    s_waitcnt vmcnt(0)
380; GFX6-NEXT:    v_madak_f32 v2, 4.0, v2, 0x41200000
381; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
382; GFX6-NEXT:    s_endpgm
383;
384; GFX8-LABEL: madak_m_inline_imm_f32:
385; GFX8:       ; %bb.0:
386; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
387; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
388; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX8-NEXT:    v_mov_b32_e32 v1, s3
390; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
391; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
392; GFX8-NEXT:    flat_load_dword v3, v[0:1]
393; GFX8-NEXT:    v_mov_b32_e32 v1, s1
394; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
395; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
396; GFX8-NEXT:    s_waitcnt vmcnt(0)
397; GFX8-NEXT:    v_madak_f32 v2, 4.0, v3, 0x41200000
398; GFX8-NEXT:    flat_store_dword v[0:1], v2
399; GFX8-NEXT:    s_endpgm
400;
401; GFX9-LABEL: madak_m_inline_imm_f32:
402; GFX9:       ; %bb.0:
403; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
404; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
405; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
406; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
407; GFX9-NEXT:    s_waitcnt vmcnt(0)
408; GFX9-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
409; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
410; GFX9-NEXT:    s_endpgm
411;
412; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
413; GFX10-MAD:       ; %bb.0:
414; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
415; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
416; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
418; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
419; GFX10-MAD-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
420; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
421; GFX10-MAD-NEXT:    s_endpgm
422;
423; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
424; GFX11-MAD:       ; %bb.0:
425; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
426; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
427; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
428; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
429; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
431; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
432; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, 4.0, v1
433; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
434; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
435; GFX11-MAD-NEXT:    s_endpgm
436;
437; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
438; GFX940-FMA:       ; %bb.0:
439; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
440; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
441; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
442; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
444; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
445; GFX940-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
446; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
447; GFX940-FMA-NEXT:    s_endpgm
448;
449; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
450; GFX10-FMA:       ; %bb.0:
451; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
452; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
453; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
455; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
456; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
457; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
458; GFX10-FMA-NEXT:    s_endpgm
459;
460; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
461; GFX11-FMA:       ; %bb.0:
462; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
463; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
464; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
465; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
466; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
468; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
469; GFX11-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
470; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
471; GFX11-FMA-NEXT:    s_endpgm
  ; The work-item id indexes both the input and the output array.
472  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
473  %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
474  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
475
476  %a = load float, ptr addrspace(1) %in.a.gep, align 4
477
  ; Constant multiplier 4.0 on the left of the fmul, constant addend 10.0.
478  %mul = fmul float 4.0, %a
479  %madak = fadd float %mul, 10.0
480  store float %madak, ptr addrspace(1) %out.gep, align 4
481  ret void
482}
483
484; Make sure nothing weird happens with a value that is also allowed as
485; an inline immediate.
; madak_inline_imm_f32: each work-item loads in.a[tid] and in.b[tid] and
; stores a * b + 4.0 to out[tid]. Unlike the 10.0 cases in this file, the
; assertions show the addend written directly as the operand 4.0 rather than
; as a 32-bit hex literal.
; Expected selection, as recorded by the autogenerated assertions:
;   - tahiti, tonga, gfx900 and gfx1010 (default contraction): v_mad_f32 with
;     4.0 as the addend (no v_madak_f32)
;   - gfx1100 (default contraction): v_mul_f32, then v_add_f32 of 4.0
;   - gfx940, gfx1010 and gfx1100 with -fp-contract=fast: v_fma_f32 with 4.0
;     as the addend (no v_fmaak_f32)
486define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
487; GFX6-LABEL: madak_inline_imm_f32:
488; GFX6:       ; %bb.0:
489; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
490; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
491; GFX6-NEXT:    s_mov_b32 s7, 0xf000
492; GFX6-NEXT:    s_mov_b32 s6, 0
493; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
494; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
495; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
496; GFX6-NEXT:    v_mov_b32_e32 v1, 0
497; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
498; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
499; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
500; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
501; GFX6-NEXT:    s_waitcnt vmcnt(0)
502; GFX6-NEXT:    v_mad_f32 v2, v2, v3, 4.0
503; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
504; GFX6-NEXT:    s_endpgm
505;
506; GFX8-LABEL: madak_inline_imm_f32:
507; GFX8:       ; %bb.0:
508; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
509; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
510; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
511; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX8-NEXT:    v_mov_b32_e32 v1, s3
513; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
514; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
515; GFX8-NEXT:    v_mov_b32_e32 v3, s5
516; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
517; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
518; GFX8-NEXT:    flat_load_dword v5, v[0:1]
519; GFX8-NEXT:    flat_load_dword v2, v[2:3]
520; GFX8-NEXT:    v_mov_b32_e32 v1, s1
521; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
522; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
523; GFX8-NEXT:    s_waitcnt vmcnt(0)
524; GFX8-NEXT:    v_mad_f32 v2, v5, v2, 4.0
525; GFX8-NEXT:    flat_store_dword v[0:1], v2
526; GFX8-NEXT:    s_endpgm
527;
528; GFX9-LABEL: madak_inline_imm_f32:
529; GFX9:       ; %bb.0:
530; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
531; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
532; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
533; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
535; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
536; GFX9-NEXT:    s_waitcnt vmcnt(0)
537; GFX9-NEXT:    v_mad_f32 v1, v1, v2, 4.0
538; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
539; GFX9-NEXT:    s_endpgm
540;
541; GFX10-MAD-LABEL: madak_inline_imm_f32:
542; GFX10-MAD:       ; %bb.0:
543; GFX10-MAD-NEXT:    s_clause 0x1
544; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
545; GFX10-MAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
546; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
547; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX10-MAD-NEXT:    s_clause 0x1
549; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
550; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7]
551; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
552; GFX10-MAD-NEXT:    v_mad_f32 v1, v1, v2, 4.0
553; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
554; GFX10-MAD-NEXT:    s_endpgm
555;
556; GFX11-MAD-LABEL: madak_inline_imm_f32:
557; GFX11-MAD:       ; %bb.0:
558; GFX11-MAD-NEXT:    s_clause 0x1
559; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
560; GFX11-MAD-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
561; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
562; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
563; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
564; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX11-MAD-NEXT:    s_clause 0x1
566; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
567; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[4:5]
568; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
569; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v2
570; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
571; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 4.0, v1
572; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
573; GFX11-MAD-NEXT:    s_endpgm
574;
575; GFX940-FMA-LABEL: madak_inline_imm_f32:
576; GFX940-FMA:       ; %bb.0:
577; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
578; GFX940-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
579; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
580; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
581; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
583; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
584; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
585; GFX940-FMA-NEXT:    v_fma_f32 v1, v1, v2, 4.0
586; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
587; GFX940-FMA-NEXT:    s_endpgm
588;
589; GFX10-FMA-LABEL: madak_inline_imm_f32:
590; GFX10-FMA:       ; %bb.0:
591; GFX10-FMA-NEXT:    s_clause 0x1
592; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
593; GFX10-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
594; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
595; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
596; GFX10-FMA-NEXT:    s_clause 0x1
597; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
598; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
599; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
600; GFX10-FMA-NEXT:    v_fma_f32 v1, v1, v2, 4.0
601; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
602; GFX10-FMA-NEXT:    s_endpgm
603;
604; GFX11-FMA-LABEL: madak_inline_imm_f32:
605; GFX11-FMA:       ; %bb.0:
606; GFX11-FMA-NEXT:    s_clause 0x1
607; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
608; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
609; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
610; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
611; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
612; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX11-FMA-NEXT:    s_clause 0x1
614; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
615; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
616; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
617; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, 4.0
618; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
619; GFX11-FMA-NEXT:    s_endpgm
  ; The work-item id indexes all three float arrays.
620  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
621  %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
622  %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
623  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
624
625  %a = load float, ptr addrspace(1) %in.a.gep, align 4
626  %b = load float, ptr addrspace(1) %in.b.gep, align 4
627
  ; Same shape as madak_f32 but with 4.0 as the addend.
628  %mul = fmul float %a, %b
629  %madak = fadd float %mul, 4.0
630  store float %madak, ptr addrspace(1) %out.gep, align 4
631  ret void
632}
633
634; We can't use an SGPR when forming madak
635define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 {
636; GFX6-LABEL: s_v_madak_f32:
637; GFX6:       ; %bb.0:
638; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
639; GFX6-NEXT:    s_load_dword s8, s[4:5], 0xd
640; GFX6-NEXT:    s_mov_b32 s7, 0xf000
641; GFX6-NEXT:    s_mov_b32 s6, 0
642; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
643; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
644; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
645; GFX6-NEXT:    v_mov_b32_e32 v1, 0
646; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
647; GFX6-NEXT:    v_mov_b32_e32 v3, 0x41200000
648; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
649; GFX6-NEXT:    s_waitcnt vmcnt(0)
650; GFX6-NEXT:    v_mac_f32_e32 v3, s8, v2
651; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
652; GFX6-NEXT:    s_endpgm
653;
654; GFX8-LABEL: s_v_madak_f32:
655; GFX8:       ; %bb.0:
656; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
657; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x34
658; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
659; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
660; GFX8-NEXT:    v_mov_b32_e32 v1, s3
661; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
662; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
663; GFX8-NEXT:    flat_load_dword v3, v[0:1]
664; GFX8-NEXT:    v_mov_b32_e32 v1, s1
665; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
666; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
667; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
668; GFX8-NEXT:    s_waitcnt vmcnt(0)
669; GFX8-NEXT:    v_mac_f32_e32 v2, s4, v3
670; GFX8-NEXT:    flat_store_dword v[0:1], v2
671; GFX8-NEXT:    s_endpgm
672;
673; GFX9-LABEL: s_v_madak_f32:
674; GFX9:       ; %bb.0:
675; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
676; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
677; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
678; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41200000
679; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
681; GFX9-NEXT:    s_waitcnt vmcnt(0)
682; GFX9-NEXT:    v_mac_f32_e32 v2, s6, v1
683; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
684; GFX9-NEXT:    s_endpgm
685;
686; GFX10-MAD-LABEL: s_v_madak_f32:
687; GFX10-MAD:       ; %bb.0:
688; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
689; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
690; GFX10-MAD-NEXT:    s_load_dword s4, s[4:5], 0x34
691; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
692; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
693; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
694; GFX10-MAD-NEXT:    v_madak_f32 v1, s4, v1, 0x41200000
695; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
696; GFX10-MAD-NEXT:    s_endpgm
697;
698; GFX11-MAD-LABEL: s_v_madak_f32:
699; GFX11-MAD:       ; %bb.0:
700; GFX11-MAD-NEXT:    s_clause 0x1
701; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
702; GFX11-MAD-NEXT:    s_load_b32 s4, s[4:5], 0x34
703; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
704; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
705; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
706; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
708; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
709; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, s4, v1
710; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
711; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
712; GFX11-MAD-NEXT:    s_endpgm
713;
714; GFX940-FMA-LABEL: s_v_madak_f32:
715; GFX940-FMA:       ; %bb.0:
716; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
717; GFX940-FMA-NEXT:    s_load_dword s6, s[4:5], 0x34
718; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
719; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
720; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, 0x41200000
721; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
722; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
723; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
724; GFX940-FMA-NEXT:    v_fmac_f32_e32 v2, s6, v1
725; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
726; GFX940-FMA-NEXT:    s_endpgm
727;
728; GFX10-FMA-LABEL: s_v_madak_f32:
729; GFX10-FMA:       ; %bb.0:
730; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
731; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
732; GFX10-FMA-NEXT:    s_load_dword s4, s[4:5], 0x34
733; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
734; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
735; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
736; GFX10-FMA-NEXT:    v_fmaak_f32 v1, s4, v1, 0x41200000
737; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
738; GFX10-FMA-NEXT:    s_endpgm
739;
740; GFX11-FMA-LABEL: s_v_madak_f32:
741; GFX11-FMA:       ; %bb.0:
742; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
743; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
744; GFX11-FMA-NEXT:    s_load_b32 s4, s[4:5], 0x34
745; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
746; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
747; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
749; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
750; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s4, v1, 0x41200000
751; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
752; GFX11-FMA-NEXT:    s_endpgm
753  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
754  %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
755  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
756
757  %a = load float, ptr addrspace(1) %in.a.gep, align 4
758
759  %mul = fmul float %a, %b
760  %madak = fadd float %mul, 10.0
761  store float %madak, ptr addrspace(1) %out.gep, align 4
762  ret void
763}
764
; Computes a * b + 10.0 where %a is a float kernel argument and %b is loaded
; per work-item from %in.b. In the expected output %a is fetched with a scalar
; load (s_load_dword / s_load_b32) and used as the scalar source of the
; multiply-add, while %b arrives in a VGPR from the global/buffer/flat load.
; 0x41200000 in the checks is the bit pattern of the 10.0 addend.
; Expected forms per run configuration:
;   - GFX6, GFX8, GFX9, GFX940-FMA runs - constant moved into the accumulator
;     VGPR, then v_mac_f32 / v_fmac_f32.
;   - GFX10-MAD run - v_madak_f32 with the literal constant.
;   - GFX10-FMA and GFX11-FMA runs - v_fmaak_f32 with the literal constant.
;   - GFX11-MAD run - separate v_mul_f32 and v_add_f32.
765define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 {
766; GFX6-LABEL: v_s_madak_f32:
767; GFX6:       ; %bb.0:
768; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
769; GFX6-NEXT:    s_mov_b32 s3, 0xf000
770; GFX6-NEXT:    s_mov_b32 s2, 0
771; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
772; GFX6-NEXT:    v_mov_b32_e32 v1, 0
773; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
775; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
776; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
777; GFX6-NEXT:    v_mov_b32_e32 v3, 0x41200000
778; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
779; GFX6-NEXT:    v_mac_f32_e32 v3, s6, v2
780; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
781; GFX6-NEXT:    s_endpgm
782;
783; GFX8-LABEL: v_s_madak_f32:
784; GFX8:       ; %bb.0:
785; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
786; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
787; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
788; GFX8-NEXT:    v_mov_b32_e32 v1, s1
789; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
790; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
791; GFX8-NEXT:    flat_load_dword v3, v[0:1]
792; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
793; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x2c
794; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
795; GFX8-NEXT:    v_mov_b32_e32 v1, s1
796; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
797; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
798; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
799; GFX8-NEXT:    s_waitcnt vmcnt(0)
800; GFX8-NEXT:    v_mac_f32_e32 v2, s2, v3
801; GFX8-NEXT:    flat_store_dword v[0:1], v2
802; GFX8-NEXT:    s_endpgm
803;
804; GFX9-LABEL: v_s_madak_f32:
805; GFX9:       ; %bb.0:
806; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
807; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
808; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41200000
809; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
811; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
812; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
813; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
814; GFX9-NEXT:    v_mac_f32_e32 v2, s2, v1
815; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
816; GFX9-NEXT:    s_endpgm
817;
818; GFX10-MAD-LABEL: v_s_madak_f32:
819; GFX10-MAD:       ; %bb.0:
820; GFX10-MAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
821; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
822; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[0:1]
824; GFX10-MAD-NEXT:    s_clause 0x1
825; GFX10-MAD-NEXT:    s_load_dword s2, s[4:5], 0x2c
826; GFX10-MAD-NEXT:    s_waitcnt_depctr 0xffe3
827; GFX10-MAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
828; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
829; GFX10-MAD-NEXT:    v_madak_f32 v1, s2, v1, 0x41200000
830; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
831; GFX10-MAD-NEXT:    s_endpgm
832;
833; GFX11-MAD-LABEL: v_s_madak_f32:
834; GFX11-MAD:       ; %bb.0:
835; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
836; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
837; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
838; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
839; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
840; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[0:1]
841; GFX11-MAD-NEXT:    s_clause 0x1
842; GFX11-MAD-NEXT:    s_load_b32 s2, s[4:5], 0x2c
843; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
844; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
845; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, s2, v1
846; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
847; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
848; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
849; GFX11-MAD-NEXT:    s_endpgm
850;
851; GFX940-FMA-LABEL: v_s_madak_f32:
852; GFX940-FMA:       ; %bb.0:
853; GFX940-FMA-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
854; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
855; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
856; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, 0x41200000
857; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
858; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[0:1]
859; GFX940-FMA-NEXT:    s_load_dword s2, s[4:5], 0x2c
860; GFX940-FMA-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
861; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
862; GFX940-FMA-NEXT:    v_fmac_f32_e32 v2, s2, v1
863; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
864; GFX940-FMA-NEXT:    s_endpgm
865;
866; GFX10-FMA-LABEL: v_s_madak_f32:
867; GFX10-FMA:       ; %bb.0:
868; GFX10-FMA-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
869; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
870; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[0:1]
872; GFX10-FMA-NEXT:    s_clause 0x1
873; GFX10-FMA-NEXT:    s_load_dword s2, s[4:5], 0x2c
874; GFX10-FMA-NEXT:    s_waitcnt_depctr 0xffe3
875; GFX10-FMA-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
876; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
877; GFX10-FMA-NEXT:    v_fmaak_f32 v1, s2, v1, 0x41200000
878; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
879; GFX10-FMA-NEXT:    s_endpgm
880;
881; GFX11-FMA-LABEL: v_s_madak_f32:
882; GFX11-FMA:       ; %bb.0:
883; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
884; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
885; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
886; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
887; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
888; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[0:1]
889; GFX11-FMA-NEXT:    s_clause 0x1
890; GFX11-FMA-NEXT:    s_load_b32 s2, s[4:5], 0x2c
891; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
892; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
893; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s2, v1, 0x41200000
894; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
895; GFX11-FMA-NEXT:    s_endpgm
  ; Index the input and output buffers by the work-item id.
896  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
897  %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
898  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
899
900  %b = load float, ptr addrspace(1) %in.b.gep, align 4
901
  ; Separate fmul + fadd with a constant addend; the checks above show which
  ; combined instruction (if any) each run is expected to select.
902  %mul = fmul float %a, %b
903  %madak = fadd float %mul, 10.0
904  store float %madak, ptr addrspace(1) %out.gep, align 4
905  ret void
906}
907
; Computes a * b + 10.0 where both multiplicands are float kernel arguments and
; the result is stored straight to %out (no work-item indexing). The checks
; expect %out, %a and %b to arrive through one s_load_dwordx4 / s_load_b128.
; In the mac, fmac, madak and fmaak forms one multiplicand (s3) is first copied
; into a VGPR and the other (s2) is used as the scalar source. The GFX11-MAD
; run instead multiplies s2 by s3 directly with v_mul_f32_e64 and then adds the
; constant. 0x41200000 is the bit pattern of the 10.0 addend.
908define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
909; GFX6-LABEL: s_s_madak_f32:
910; GFX6:       ; %bb.0:
911; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
912; GFX6-NEXT:    v_mov_b32_e32 v0, 0x41200000
913; GFX6-NEXT:    s_mov_b32 s7, 0xf000
914; GFX6-NEXT:    s_mov_b32 s6, -1
915; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX6-NEXT:    v_mov_b32_e32 v1, s3
917; GFX6-NEXT:    s_mov_b32 s4, s0
918; GFX6-NEXT:    s_mov_b32 s5, s1
919; GFX6-NEXT:    v_mac_f32_e32 v0, s2, v1
920; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
921; GFX6-NEXT:    s_endpgm
922;
923; GFX8-LABEL: s_s_madak_f32:
924; GFX8:       ; %bb.0:
925; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
926; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
927; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX8-NEXT:    v_mov_b32_e32 v0, s3
929; GFX8-NEXT:    v_mac_f32_e32 v2, s2, v0
930; GFX8-NEXT:    v_mov_b32_e32 v0, s0
931; GFX8-NEXT:    v_mov_b32_e32 v1, s1
932; GFX8-NEXT:    flat_store_dword v[0:1], v2
933; GFX8-NEXT:    s_endpgm
934;
935; GFX9-LABEL: s_s_madak_f32:
936; GFX9:       ; %bb.0:
937; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
938; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
939; GFX9-NEXT:    v_mov_b32_e32 v0, 0
940; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
941; GFX9-NEXT:    v_mov_b32_e32 v2, s3
942; GFX9-NEXT:    v_mac_f32_e32 v1, s2, v2
943; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
944; GFX9-NEXT:    s_endpgm
945;
946; GFX10-MAD-LABEL: s_s_madak_f32:
947; GFX10-MAD:       ; %bb.0:
948; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
949; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
950; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s3
952; GFX10-MAD-NEXT:    v_madak_f32 v0, s2, v0, 0x41200000
953; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
954; GFX10-MAD-NEXT:    s_endpgm
955;
956; GFX11-MAD-LABEL: s_s_madak_f32:
957; GFX11-MAD:       ; %bb.0:
958; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
959; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX11-MAD-NEXT:    v_mul_f32_e64 v0, s2, s3
961; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
962; GFX11-MAD-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0
963; GFX11-MAD-NEXT:    global_store_b32 v1, v0, s[0:1]
964; GFX11-MAD-NEXT:    s_endpgm
965;
966; GFX940-FMA-LABEL: s_s_madak_f32:
967; GFX940-FMA:       ; %bb.0:
968; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
969; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x41200000
970; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
971; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s3
973; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s2, v2
974; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
975; GFX940-FMA-NEXT:    s_endpgm
976;
977; GFX10-FMA-LABEL: s_s_madak_f32:
978; GFX10-FMA:       ; %bb.0:
979; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
980; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
981; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s3
983; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
984; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
985; GFX10-FMA-NEXT:    s_endpgm
986;
987; GFX11-FMA-LABEL: s_s_madak_f32:
988; GFX11-FMA:       ; %bb.0:
989; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
990; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX11-FMA-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
992; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
993; GFX11-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
994; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[0:1]
995; GFX11-FMA-NEXT:    s_endpgm
  ; Both operands come directly from the kernel arguments; nothing is loaded
  ; from memory in the IR body.
996  %mul = fmul float %a, %b
997  %madak = fadd float %mul, 10.0
998  store float %madak, ptr addrspace(1) %out, align 4
999  ret void
1000}
1001
; Computes |a| * b + 10.0 with both operands loaded per work-item, where the
; fabs is applied to the first multiplicand. The checks expect the
; three-operand v_mad_f32 / v_fma_f32 with the |v| modifier on the first source
; instead of v_madak_f32 / v_fmaak_f32.
; NOTE(review): presumably because the madak/fmaak encodings cannot carry
; source modifiers - confirm against the ISA documentation.
; The 10.0 constant (0x41200000) is materialized in an SGPR with s_mov_b32 in
; the GFX6, GFX8, GFX9 and GFX940-FMA runs and appears as a literal operand in
; the GFX10-MAD, GFX10-FMA and GFX11-FMA runs. The GFX11-MAD run expects
; v_mul_f32_e64 with the |v| modifier followed by a separate v_add_f32.
1002define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
1003; GFX6-LABEL: no_madak_src0_modifier_f32:
1004; GFX6:       ; %bb.0:
1005; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1006; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1007; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1008; GFX6-NEXT:    s_mov_b32 s6, 0
1009; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1010; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1012; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1013; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
1014; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1015; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
1016; GFX6-NEXT:    s_mov_b32 s4, 0x41200000
1017; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1018; GFX6-NEXT:    s_waitcnt vmcnt(0)
1019; GFX6-NEXT:    v_mad_f32 v2, |v2|, v3, s4
1020; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1021; GFX6-NEXT:    s_endpgm
1022;
1023; GFX8-LABEL: no_madak_src0_modifier_f32:
1024; GFX8:       ; %bb.0:
1025; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1026; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1027; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1028; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1029; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1030; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1031; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1032; GFX8-NEXT:    v_mov_b32_e32 v3, s5
1033; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1034; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1035; GFX8-NEXT:    flat_load_dword v5, v[0:1]
1036; GFX8-NEXT:    flat_load_dword v2, v[2:3]
1037; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1038; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1039; GFX8-NEXT:    s_mov_b32 s0, 0x41200000
1040; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1041; GFX8-NEXT:    s_waitcnt vmcnt(0)
1042; GFX8-NEXT:    v_mad_f32 v2, |v5|, v2, s0
1043; GFX8-NEXT:    flat_store_dword v[0:1], v2
1044; GFX8-NEXT:    s_endpgm
1045;
1046; GFX9-LABEL: no_madak_src0_modifier_f32:
1047; GFX9:       ; %bb.0:
1048; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1049; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1050; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1051; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1053; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1054; GFX9-NEXT:    s_mov_b32 s2, 0x41200000
1055; GFX9-NEXT:    s_waitcnt vmcnt(0)
1056; GFX9-NEXT:    v_mad_f32 v1, |v1|, v2, s2
1057; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1058; GFX9-NEXT:    s_endpgm
1059;
1060; GFX10-MAD-LABEL: no_madak_src0_modifier_f32:
1061; GFX10-MAD:       ; %bb.0:
1062; GFX10-MAD-NEXT:    s_clause 0x1
1063; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1064; GFX10-MAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1065; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1066; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1067; GFX10-MAD-NEXT:    s_clause 0x1
1068; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
1069; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7]
1070; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
1071; GFX10-MAD-NEXT:    v_mad_f32 v1, |v1|, v2, 0x41200000
1072; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
1073; GFX10-MAD-NEXT:    s_endpgm
1074;
1075; GFX11-MAD-LABEL: no_madak_src0_modifier_f32:
1076; GFX11-MAD:       ; %bb.0:
1077; GFX11-MAD-NEXT:    s_clause 0x1
1078; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1079; GFX11-MAD-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1080; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1081; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1082; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1083; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX11-MAD-NEXT:    s_clause 0x1
1085; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
1086; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[4:5]
1087; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
1088; GFX11-MAD-NEXT:    v_mul_f32_e64 v1, |v1|, v2
1089; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1090; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
1091; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
1092; GFX11-MAD-NEXT:    s_endpgm
1093;
1094; GFX940-FMA-LABEL: no_madak_src0_modifier_f32:
1095; GFX940-FMA:       ; %bb.0:
1096; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1097; GFX940-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1098; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1099; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1100; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1101; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
1102; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
1103; GFX940-FMA-NEXT:    s_mov_b32 s2, 0x41200000
1104; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
1105; GFX940-FMA-NEXT:    v_fma_f32 v1, |v1|, v2, s2
1106; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
1107; GFX940-FMA-NEXT:    s_endpgm
1108;
1109; GFX10-FMA-LABEL: no_madak_src0_modifier_f32:
1110; GFX10-FMA:       ; %bb.0:
1111; GFX10-FMA-NEXT:    s_clause 0x1
1112; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1113; GFX10-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1114; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1115; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX10-FMA-NEXT:    s_clause 0x1
1117; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
1118; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
1119; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
1120; GFX10-FMA-NEXT:    v_fma_f32 v1, |v1|, v2, 0x41200000
1121; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
1122; GFX10-FMA-NEXT:    s_endpgm
1123;
1124; GFX11-FMA-LABEL: no_madak_src0_modifier_f32:
1125; GFX11-FMA:       ; %bb.0:
1126; GFX11-FMA-NEXT:    s_clause 0x1
1127; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1128; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1129; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1130; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1131; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1132; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1133; GFX11-FMA-NEXT:    s_clause 0x1
1134; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1135; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1136; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1137; GFX11-FMA-NEXT:    v_fma_f32 v1, |v1|, v2, 0x41200000
1138; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1139; GFX11-FMA-NEXT:    s_endpgm
  ; Index both inputs and the output by the work-item id.
1140  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
1141  %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
1142  %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
1143  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1144
1145  %a = load float, ptr addrspace(1) %in.a.gep, align 4
1146  %b = load float, ptr addrspace(1) %in.b.gep, align 4
1147
  ; The fabs on the first fmul operand is what shows up as |v| on src0 in the
  ; checks above.
1148  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
1149
1150  %mul = fmul float %a.fabs, %b
1151  %madak = fadd float %mul, 10.0
1152  store float %madak, ptr addrspace(1) %out.gep, align 4
1153  ret void
1154}
1155
; Computes a * |b| + 10.0 with both operands loaded per work-item, where the
; fabs is applied to the second multiplicand. The checks expect the
; three-operand v_mad_f32 / v_fma_f32 with the |v| modifier on the second
; source instead of v_madak_f32 / v_fmaak_f32.
; NOTE(review): presumably because the madak/fmaak encodings cannot carry
; source modifiers - confirm against the ISA documentation.
; The 10.0 constant (0x41200000) is materialized in an SGPR with s_mov_b32 in
; the GFX6, GFX8, GFX9 and GFX940-FMA runs and appears as a literal operand in
; the GFX10-MAD, GFX10-FMA and GFX11-FMA runs. The GFX11-MAD run expects
; v_mul_f32_e64 with the |v| modifier followed by a separate v_add_f32.
1156define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
1157; GFX6-LABEL: no_madak_src1_modifier_f32:
1158; GFX6:       ; %bb.0:
1159; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1160; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1161; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1162; GFX6-NEXT:    s_mov_b32 s6, 0
1163; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1164; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1166; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1167; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
1168; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1169; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
1170; GFX6-NEXT:    s_mov_b32 s4, 0x41200000
1171; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1172; GFX6-NEXT:    s_waitcnt vmcnt(0)
1173; GFX6-NEXT:    v_mad_f32 v2, v2, |v3|, s4
1174; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1175; GFX6-NEXT:    s_endpgm
1176;
1177; GFX8-LABEL: no_madak_src1_modifier_f32:
1178; GFX8:       ; %bb.0:
1179; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1180; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1181; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1182; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1183; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1184; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1185; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1186; GFX8-NEXT:    v_mov_b32_e32 v3, s5
1187; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1188; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1189; GFX8-NEXT:    flat_load_dword v5, v[0:1]
1190; GFX8-NEXT:    flat_load_dword v2, v[2:3]
1191; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1192; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1193; GFX8-NEXT:    s_mov_b32 s0, 0x41200000
1194; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1195; GFX8-NEXT:    s_waitcnt vmcnt(0)
1196; GFX8-NEXT:    v_mad_f32 v2, v5, |v2|, s0
1197; GFX8-NEXT:    flat_store_dword v[0:1], v2
1198; GFX8-NEXT:    s_endpgm
1199;
1200; GFX9-LABEL: no_madak_src1_modifier_f32:
1201; GFX9:       ; %bb.0:
1202; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1203; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1204; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1205; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1207; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1208; GFX9-NEXT:    s_mov_b32 s2, 0x41200000
1209; GFX9-NEXT:    s_waitcnt vmcnt(0)
1210; GFX9-NEXT:    v_mad_f32 v1, v1, |v2|, s2
1211; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1212; GFX9-NEXT:    s_endpgm
1213;
1214; GFX10-MAD-LABEL: no_madak_src1_modifier_f32:
1215; GFX10-MAD:       ; %bb.0:
1216; GFX10-MAD-NEXT:    s_clause 0x1
1217; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1218; GFX10-MAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1219; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1220; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1221; GFX10-MAD-NEXT:    s_clause 0x1
1222; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
1223; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7]
1224; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
1225; GFX10-MAD-NEXT:    v_mad_f32 v1, v1, |v2|, 0x41200000
1226; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
1227; GFX10-MAD-NEXT:    s_endpgm
1228;
1229; GFX11-MAD-LABEL: no_madak_src1_modifier_f32:
1230; GFX11-MAD:       ; %bb.0:
1231; GFX11-MAD-NEXT:    s_clause 0x1
1232; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1233; GFX11-MAD-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1234; GFX11-MAD-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1235; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1236; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1237; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX11-MAD-NEXT:    s_clause 0x1
1239; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
1240; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[4:5]
1241; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
1242; GFX11-MAD-NEXT:    v_mul_f32_e64 v1, v1, |v2|
1243; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1244; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
1245; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
1246; GFX11-MAD-NEXT:    s_endpgm
1247;
1248; GFX940-FMA-LABEL: no_madak_src1_modifier_f32:
1249; GFX940-FMA:       ; %bb.0:
1250; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1251; GFX940-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1252; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1253; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1254; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1255; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
1256; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
1257; GFX940-FMA-NEXT:    s_mov_b32 s2, 0x41200000
1258; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
1259; GFX940-FMA-NEXT:    v_fma_f32 v1, v1, |v2|, s2
1260; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
1261; GFX940-FMA-NEXT:    s_endpgm
1262;
1263; GFX10-FMA-LABEL: no_madak_src1_modifier_f32:
1264; GFX10-FMA:       ; %bb.0:
1265; GFX10-FMA-NEXT:    s_clause 0x1
1266; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1267; GFX10-FMA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1268; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1269; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX10-FMA-NEXT:    s_clause 0x1
1271; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
1272; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7]
1273; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
1274; GFX10-FMA-NEXT:    v_fma_f32 v1, v1, |v2|, 0x41200000
1275; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
1276; GFX10-FMA-NEXT:    s_endpgm
1277;
1278; GFX11-FMA-LABEL: no_madak_src1_modifier_f32:
1279; GFX11-FMA:       ; %bb.0:
1280; GFX11-FMA-NEXT:    s_clause 0x1
1281; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1282; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1283; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1284; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1285; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1286; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX11-FMA-NEXT:    s_clause 0x1
1288; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
1289; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
1290; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1291; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, |v2|, 0x41200000
1292; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
1293; GFX11-FMA-NEXT:    s_endpgm
  ; Index both inputs and the output by the work-item id.
1294  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
1295  %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
1296  %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
1297  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1298
1299  %a = load float, ptr addrspace(1) %in.a.gep, align 4
1300  %b = load float, ptr addrspace(1) %in.b.gep, align 4
1301
  ; The fabs on the second fmul operand is what shows up as |v| on src1 in the
  ; checks above.
1302  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
1303
1304  %mul = fmul float %a, %b.fabs
1305  %madak = fadd float %mul, 10.0
1306  store float %madak, ptr addrspace(1) %out.gep, align 4
1307  ret void
1308}
1309
1310; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
1311; because the implicit immediate already uses the constant bus.
1312; On GFX10+ we can use two scalar operands.
1313define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
1314; GFX6-LABEL: madak_constant_bus_violation:
1315; GFX6:       ; %bb.0: ; %bb
1316; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x9
1317; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1318; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
1319; GFX6-NEXT:    s_cbranch_scc1 .LBB9_2
1320; GFX6-NEXT:  ; %bb.1: ; %bb3
1321; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1322; GFX6-NEXT:    s_mov_b32 s2, -1
1323; GFX6-NEXT:    v_mov_b32_e32 v0, 0
1324; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1325; GFX6-NEXT:    s_waitcnt vmcnt(0)
1326; GFX6-NEXT:  .LBB9_2: ; %bb4
1327; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1328; GFX6-NEXT:    s_mov_b32 s2, -1
1329; GFX6-NEXT:    s_waitcnt expcnt(0)
1330; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
1331; GFX6-NEXT:    s_waitcnt vmcnt(0)
1332; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x12
1333; GFX6-NEXT:    v_mov_b32_e32 v1, 0x42280000
1334; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1335; GFX6-NEXT:    v_mac_f32_e64 v1, s0, 0.5
1336; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
1337; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1338; GFX6-NEXT:    s_waitcnt vmcnt(0)
1339; GFX6-NEXT:    s_endpgm
1340;
1341; GFX8-LABEL: madak_constant_bus_violation:
1342; GFX8:       ; %bb.0: ; %bb
1343; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x24
1344; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1345; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1346; GFX8-NEXT:    s_cbranch_scc1 .LBB9_2
1347; GFX8-NEXT:  ; %bb.1: ; %bb3
1348; GFX8-NEXT:    v_mov_b32_e32 v0, 0
1349; GFX8-NEXT:    flat_store_dword v[0:1], v0
1350; GFX8-NEXT:    s_waitcnt vmcnt(0)
1351; GFX8-NEXT:  .LBB9_2: ; %bb4
1352; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
1353; GFX8-NEXT:    s_waitcnt vmcnt(0)
1354; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x48
1355; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42280000
1356; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1357; GFX8-NEXT:    v_mac_f32_e64 v1, s0, 0.5
1358; GFX8-NEXT:    v_mul_f32_e32 v0, v1, v0
1359; GFX8-NEXT:    flat_store_dword v[0:1], v0
1360; GFX8-NEXT:    s_waitcnt vmcnt(0)
1361; GFX8-NEXT:    s_endpgm
1362;
1363; GFX9-LABEL: madak_constant_bus_violation:
1364; GFX9:       ; %bb.0: ; %bb
1365; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
1366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
1368; GFX9-NEXT:    s_cbranch_scc1 .LBB9_2
1369; GFX9-NEXT:  ; %bb.1: ; %bb3
1370; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1371; GFX9-NEXT:    global_store_dword v[0:1], v0, off
1372; GFX9-NEXT:    s_waitcnt vmcnt(0)
1373; GFX9-NEXT:  .LBB9_2: ; %bb4
1374; GFX9-NEXT:    global_load_dword v0, v[0:1], off glc
1375; GFX9-NEXT:    s_waitcnt vmcnt(0)
1376; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x48
1377; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42280000
1378; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1379; GFX9-NEXT:    v_mac_f32_e64 v1, s0, 0.5
1380; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
1381; GFX9-NEXT:    global_store_dword v[0:1], v0, off
1382; GFX9-NEXT:    s_waitcnt vmcnt(0)
1383; GFX9-NEXT:    s_endpgm
1384;
1385; GFX10-MAD-LABEL: madak_constant_bus_violation:
1386; GFX10-MAD:       ; %bb.0: ; %bb
1387; GFX10-MAD-NEXT:    s_load_dword s0, s[4:5], 0x24
1388; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX10-MAD-NEXT:    s_cmp_lg_u32 s0, 0
1390; GFX10-MAD-NEXT:    s_cbranch_scc1 .LBB9_2
1391; GFX10-MAD-NEXT:  ; %bb.1: ; %bb3
1392; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, 0
1393; GFX10-MAD-NEXT:    global_store_dword v[0:1], v0, off
1394; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
1395; GFX10-MAD-NEXT:  .LBB9_2: ; %bb4
1396; GFX10-MAD-NEXT:    global_load_dword v0, v[0:1], off glc dlc
1397; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
1398; GFX10-MAD-NEXT:    s_load_dword s0, s[4:5], 0x48
1399; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0.5
1400; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1401; GFX10-MAD-NEXT:    v_madak_f32 v1, s0, v1, 0x42280000
1402; GFX10-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
1403; GFX10-MAD-NEXT:    global_store_dword v[0:1], v0, off
1404; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
1405; GFX10-MAD-NEXT:    s_endpgm
1406;
1407; GFX11-MAD-LABEL: madak_constant_bus_violation:
1408; GFX11-MAD:       ; %bb.0: ; %bb
1409; GFX11-MAD-NEXT:    s_load_b32 s0, s[4:5], 0x24
1410; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX11-MAD-NEXT:    s_cmp_lg_u32 s0, 0
1412; GFX11-MAD-NEXT:    s_cbranch_scc1 .LBB9_2
1413; GFX11-MAD-NEXT:  ; %bb.1: ; %bb3
1414; GFX11-MAD-NEXT:    v_mov_b32_e32 v0, 0
1415; GFX11-MAD-NEXT:    global_store_b32 v[0:1], v0, off dlc
1416; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
1417; GFX11-MAD-NEXT:  .LBB9_2: ; %bb4
1418; GFX11-MAD-NEXT:    global_load_b32 v0, v[0:1], off glc dlc
1419; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
1420; GFX11-MAD-NEXT:    s_load_b32 s0, s[4:5], 0x48
1421; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
1422; GFX11-MAD-NEXT:    v_mul_f32_e64 v1, s0, 0.5
1423; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1424; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x42280000, v1
1425; GFX11-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
1426; GFX11-MAD-NEXT:    global_store_b32 v[0:1], v0, off dlc
1427; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
1428; GFX11-MAD-NEXT:    s_endpgm
1429;
1430; GFX940-FMA-LABEL: madak_constant_bus_violation:
1431; GFX940-FMA:       ; %bb.0: ; %bb
1432; GFX940-FMA-NEXT:    s_load_dword s0, s[4:5], 0x24
1433; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1434; GFX940-FMA-NEXT:    s_cmp_lg_u32 s0, 0
1435; GFX940-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
1436; GFX940-FMA-NEXT:  ; %bb.1: ; %bb3
1437; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
1438; GFX940-FMA-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
1439; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
1440; GFX940-FMA-NEXT:  .LBB9_2: ; %bb4
1441; GFX940-FMA-NEXT:    global_load_dword v0, v[0:1], off sc0 sc1
1442; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
1443; GFX940-FMA-NEXT:    s_load_dword s0, s[4:5], 0x48
1444; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x42280000
1445; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1446; GFX940-FMA-NEXT:    v_fmac_f32_e64 v1, s0, 0.5
1447; GFX940-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1448; GFX940-FMA-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
1449; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
1450; GFX940-FMA-NEXT:    s_endpgm
1451;
1452; GFX10-FMA-LABEL: madak_constant_bus_violation:
1453; GFX10-FMA:       ; %bb.0: ; %bb
1454; GFX10-FMA-NEXT:    s_load_dword s0, s[4:5], 0x24
1455; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1456; GFX10-FMA-NEXT:    s_cmp_lg_u32 s0, 0
1457; GFX10-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
1458; GFX10-FMA-NEXT:  ; %bb.1: ; %bb3
1459; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, 0
1460; GFX10-FMA-NEXT:    global_store_dword v[0:1], v0, off
1461; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
1462; GFX10-FMA-NEXT:  .LBB9_2: ; %bb4
1463; GFX10-FMA-NEXT:    global_load_dword v0, v[0:1], off glc dlc
1464; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
1465; GFX10-FMA-NEXT:    s_load_dword s0, s[4:5], 0x48
1466; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0.5
1467; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1468; GFX10-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x42280000
1469; GFX10-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1470; GFX10-FMA-NEXT:    global_store_dword v[0:1], v0, off
1471; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
1472; GFX10-FMA-NEXT:    s_endpgm
1473;
1474; GFX11-FMA-LABEL: madak_constant_bus_violation:
1475; GFX11-FMA:       ; %bb.0: ; %bb
1476; GFX11-FMA-NEXT:    s_load_b32 s0, s[4:5], 0x24
1477; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1478; GFX11-FMA-NEXT:    s_cmp_lg_u32 s0, 0
1479; GFX11-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
1480; GFX11-FMA-NEXT:  ; %bb.1: ; %bb3
1481; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
1482; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
1483; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
1484; GFX11-FMA-NEXT:  .LBB9_2: ; %bb4
1485; GFX11-FMA-NEXT:    global_load_b32 v0, v[0:1], off glc dlc
1486; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
1487; GFX11-FMA-NEXT:    s_load_b32 s0, s[4:5], 0x48
1488; GFX11-FMA-NEXT:    v_mov_b32_e32 v1, 0.5
1489; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1491; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x42280000
1492; GFX11-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
1493; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
1494; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
1495; GFX11-FMA-NEXT:    s_endpgm
1496bb:
1497  %tmp = icmp eq i32 %arg1, 0
1498  br i1 %tmp, label %bb3, label %bb4
1499
1500bb3:
1501  store volatile float 0.0, ptr addrspace(1) undef
1502  br label %bb4
1503
1504bb4:
1505  %vgpr = load volatile float, ptr addrspace(1) undef
1506  %tmp0 = fmul float %sgpr0, 0.5
1507  %tmp1 = fadd float %tmp0, 42.0
1508  %tmp2 = fmul float %tmp1, %vgpr
1509  store volatile float %tmp2, ptr addrspace(1) undef, align 4
1510  ret void
1511}
1512
; Attribute group #0, referenced by kernel definitions in this file
; (e.g. @madak_f32).
;   nounwind               - the function is declared as never unwinding.
;   "denormal-fp-math-f32" - f32 denormal mode; per the LLVM LangRef the
;                            comma-separated pair is (result mode, input mode),
;                            both set to "preserve-sign" here.
; NOTE(review): presumably this f32 denormal setting is what permits the
; v_mac_f32 / v_madak_f32 forms expected by the non-FMA check prefixes;
; confirm against the AMDGPU backend before changing it, and regenerate the
; assertions with utils/update_llc_test_checks.py if it is modified.
1513attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1514