xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll (revision 66bd3cd75b32ccfa8d228c200cf4fbf72d49fd1f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
4
; Baseline for the fp8.bf8 variant: %a, %b and %c are forwarded unchanged to
; llvm.amdgcn.dot4.f32.fp8.bf8 and the call result is returned.
; Expected output: the function-entry s_wait_* sequence, a single
; v_dot4_f32_fp8_bf8 v0, v0, v1, v2 with no neg_lo/neg_hi modifier, then the
; s_setpc_b64 return.
5define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
6; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8:
7; GFX12:       ; %bb.0: ; %entry
8; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9; GFX12-NEXT:    s_wait_expcnt 0x0
10; GFX12-NEXT:    s_wait_samplecnt 0x0
11; GFX12-NEXT:    s_wait_bvhcnt 0x0
12; GFX12-NEXT:    s_wait_kmcnt 0x0
13; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2
14; GFX12-NEXT:    s_setpc_b64 s[30:31]
15entry:
16  %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
17  ret float %ret
18}
19
; fabs on the third operand: llvm.fabs.f32(%c) feeds the float argument of
; llvm.amdgcn.dot4.f32.fp8.bf8.
; Expected output contains no separate instruction for the fabs; the only
; difference from the baseline test is neg_hi:[0,0,1] on the dot4 instruction.
20define float @test_amdgcn_dot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
21; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs:
22; GFX12:       ; %bb.0: ; %entry
23; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
24; GFX12-NEXT:    s_wait_expcnt 0x0
25; GFX12-NEXT:    s_wait_samplecnt 0x0
26; GFX12-NEXT:    s_wait_bvhcnt 0x0
27; GFX12-NEXT:    s_wait_kmcnt 0x0
28; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
29; GFX12-NEXT:    s_setpc_b64 s[30:31]
30entry:
31  %fabs.c = call float @llvm.fabs.f32(float %c)
32  %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.c)
33  ret float %ret
34}
35
; fneg on the third operand: the negated %c feeds the float argument of
; llvm.amdgcn.dot4.f32.fp8.bf8.
; Expected output contains no separate instruction for the fneg; the only
; difference from the baseline test is neg_lo:[0,0,1] on the dot4 instruction.
36define float @test_amdgcn_dot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
37; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg:
38; GFX12:       ; %bb.0: ; %entry
39; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
40; GFX12-NEXT:    s_wait_expcnt 0x0
41; GFX12-NEXT:    s_wait_samplecnt 0x0
42; GFX12-NEXT:    s_wait_bvhcnt 0x0
43; GFX12-NEXT:    s_wait_kmcnt 0x0
44; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
45; GFX12-NEXT:    s_setpc_b64 s[30:31]
46entry:
47  %fneg.c = fneg float %c
48  %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.c)
49  ret float %ret
50}
51
; fabs applied to fneg: the third operand is llvm.fabs.f32(fneg %c).
; Expected output carries only neg_hi:[0,0,1] on the dot4 instruction, i.e.
; the same modifier as the plain fabs test; the inner fneg leaves no trace
; in the checked output.
52define float @test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
53; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg:
54; GFX12:       ; %bb.0: ; %entry
55; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
56; GFX12-NEXT:    s_wait_expcnt 0x0
57; GFX12-NEXT:    s_wait_samplecnt 0x0
58; GFX12-NEXT:    s_wait_bvhcnt 0x0
59; GFX12-NEXT:    s_wait_kmcnt 0x0
60; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
61; GFX12-NEXT:    s_setpc_b64 s[30:31]
62entry:
63  %fneg.c = fneg float %c
64  %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
65  %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
66  ret float %ret
67}
68
; fneg applied to fabs: the third operand is fneg(llvm.fabs.f32(%c)).
; Expected output carries both neg_lo:[0,0,1] and neg_hi:[0,0,1] on the dot4
; instruction, with no separate instruction for either source modifier.
69define float @test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
70; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs:
71; GFX12:       ; %bb.0: ; %entry
72; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
73; GFX12-NEXT:    s_wait_expcnt 0x0
74; GFX12-NEXT:    s_wait_samplecnt 0x0
75; GFX12-NEXT:    s_wait_bvhcnt 0x0
76; GFX12-NEXT:    s_wait_kmcnt 0x0
77; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
78; GFX12-NEXT:    s_setpc_b64 s[30:31]
79entry:
80  %fabs.c = call float @llvm.fabs.f32(float %c)
81  %fneg.fabs.c = fneg float %fabs.c
82  %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
83  ret float %ret
84}
85
; Baseline for the bf8.fp8 variant: %a, %b and %c are forwarded unchanged to
; llvm.amdgcn.dot4.f32.bf8.fp8 and the call result is returned.
; Expected output: the function-entry s_wait_* sequence, a single
; v_dot4_f32_bf8_fp8 v0, v0, v1, v2 with no neg_lo/neg_hi modifier, then the
; s_setpc_b64 return.
86define float @test_amdgcn_dot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
87; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8:
88; GFX12:       ; %bb.0: ; %entry
89; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
90; GFX12-NEXT:    s_wait_expcnt 0x0
91; GFX12-NEXT:    s_wait_samplecnt 0x0
92; GFX12-NEXT:    s_wait_bvhcnt 0x0
93; GFX12-NEXT:    s_wait_kmcnt 0x0
94; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2
95; GFX12-NEXT:    s_setpc_b64 s[30:31]
96entry:
97  %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
98  ret float %ret
99}
100
; fabs on the third operand: llvm.fabs.f32(%c) feeds the float argument of
; llvm.amdgcn.dot4.f32.bf8.fp8.
; Expected output contains no separate instruction for the fabs; the only
; difference from the baseline test is neg_hi:[0,0,1] on the dot4 instruction.
101define float @test_amdgcn_dot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
102; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs:
103; GFX12:       ; %bb.0: ; %entry
104; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
105; GFX12-NEXT:    s_wait_expcnt 0x0
106; GFX12-NEXT:    s_wait_samplecnt 0x0
107; GFX12-NEXT:    s_wait_bvhcnt 0x0
108; GFX12-NEXT:    s_wait_kmcnt 0x0
109; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
110; GFX12-NEXT:    s_setpc_b64 s[30:31]
111entry:
112  %fabs.c = call float @llvm.fabs.f32(float %c)
113  %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.c)
114  ret float %ret
115}
116
; fneg on the third operand: the negated %c feeds the float argument of
; llvm.amdgcn.dot4.f32.bf8.fp8.
; Expected output contains no separate instruction for the fneg; the only
; difference from the baseline test is neg_lo:[0,0,1] on the dot4 instruction.
117define float @test_amdgcn_dot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
118; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg:
119; GFX12:       ; %bb.0: ; %entry
120; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
121; GFX12-NEXT:    s_wait_expcnt 0x0
122; GFX12-NEXT:    s_wait_samplecnt 0x0
123; GFX12-NEXT:    s_wait_bvhcnt 0x0
124; GFX12-NEXT:    s_wait_kmcnt 0x0
125; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
126; GFX12-NEXT:    s_setpc_b64 s[30:31]
127entry:
128  %fneg.c = fneg float %c
129  %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.c)
130  ret float %ret
131}
132
; fabs applied to fneg: the third operand is llvm.fabs.f32(fneg %c).
; Expected output carries only neg_hi:[0,0,1] on the dot4 instruction, i.e.
; the same modifier as the plain fabs test; the inner fneg leaves no trace
; in the checked output.
133define float @test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
134; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg:
135; GFX12:       ; %bb.0: ; %entry
136; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
137; GFX12-NEXT:    s_wait_expcnt 0x0
138; GFX12-NEXT:    s_wait_samplecnt 0x0
139; GFX12-NEXT:    s_wait_bvhcnt 0x0
140; GFX12-NEXT:    s_wait_kmcnt 0x0
141; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
142; GFX12-NEXT:    s_setpc_b64 s[30:31]
143entry:
144  %fneg.c = fneg float %c
145  %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
146  %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
147  ret float %ret
148}
149
; fneg applied to fabs: the third operand is fneg(llvm.fabs.f32(%c)).
; Expected output carries both neg_lo:[0,0,1] and neg_hi:[0,0,1] on the dot4
; instruction, with no separate instruction for either source modifier.
150define float @test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
151; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs:
152; GFX12:       ; %bb.0: ; %entry
153; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
154; GFX12-NEXT:    s_wait_expcnt 0x0
155; GFX12-NEXT:    s_wait_samplecnt 0x0
156; GFX12-NEXT:    s_wait_bvhcnt 0x0
157; GFX12-NEXT:    s_wait_kmcnt 0x0
158; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
159; GFX12-NEXT:    s_setpc_b64 s[30:31]
160entry:
161  %fabs.c = call float @llvm.fabs.f32(float %c)
162  %fneg.fabs.c = fneg float %fabs.c
163  %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
164  ret float %ret
165}
166
; Baseline for the fp8.fp8 variant: %a, %b and %c are forwarded unchanged to
; llvm.amdgcn.dot4.f32.fp8.fp8 and the call result is returned.
; Expected output: the function-entry s_wait_* sequence, a single
; v_dot4_f32_fp8_fp8 v0, v0, v1, v2 with no neg_lo/neg_hi modifier, then the
; s_setpc_b64 return.
167define float @test_amdgcn_dot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) {
168; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8:
169; GFX12:       ; %bb.0: ; %entry
170; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
171; GFX12-NEXT:    s_wait_expcnt 0x0
172; GFX12-NEXT:    s_wait_samplecnt 0x0
173; GFX12-NEXT:    s_wait_bvhcnt 0x0
174; GFX12-NEXT:    s_wait_kmcnt 0x0
175; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2
176; GFX12-NEXT:    s_setpc_b64 s[30:31]
177entry:
178  %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
179  ret float %ret
180}
181
; fabs on the third operand: llvm.fabs.f32(%c) feeds the float argument of
; llvm.amdgcn.dot4.f32.fp8.fp8.
; Expected output contains no separate instruction for the fabs; the only
; difference from the baseline test is neg_hi:[0,0,1] on the dot4 instruction.
182define float @test_amdgcn_dot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) {
183; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs:
184; GFX12:       ; %bb.0: ; %entry
185; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
186; GFX12-NEXT:    s_wait_expcnt 0x0
187; GFX12-NEXT:    s_wait_samplecnt 0x0
188; GFX12-NEXT:    s_wait_bvhcnt 0x0
189; GFX12-NEXT:    s_wait_kmcnt 0x0
190; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
191; GFX12-NEXT:    s_setpc_b64 s[30:31]
192entry:
193  %fabs.c = call float @llvm.fabs.f32(float %c)
194  %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.c)
195  ret float %ret
196}
197
; fneg on the third operand: the negated %c feeds the float argument of
; llvm.amdgcn.dot4.f32.fp8.fp8.
; Expected output contains no separate instruction for the fneg; the only
; difference from the baseline test is neg_lo:[0,0,1] on the dot4 instruction.
198define float @test_amdgcn_dot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) {
199; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg:
200; GFX12:       ; %bb.0: ; %entry
201; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
202; GFX12-NEXT:    s_wait_expcnt 0x0
203; GFX12-NEXT:    s_wait_samplecnt 0x0
204; GFX12-NEXT:    s_wait_bvhcnt 0x0
205; GFX12-NEXT:    s_wait_kmcnt 0x0
206; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
207; GFX12-NEXT:    s_setpc_b64 s[30:31]
208entry:
209  %fneg.c = fneg float %c
210  %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.c)
211  ret float %ret
212}
213
; fabs applied to fneg: the third operand is llvm.fabs.f32(fneg %c).
; Expected output carries only neg_hi:[0,0,1] on the dot4 instruction, i.e.
; the same modifier as the plain fabs test; the inner fneg leaves no trace
; in the checked output.
214define float @test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
215; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg:
216; GFX12:       ; %bb.0: ; %entry
217; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
218; GFX12-NEXT:    s_wait_expcnt 0x0
219; GFX12-NEXT:    s_wait_samplecnt 0x0
220; GFX12-NEXT:    s_wait_bvhcnt 0x0
221; GFX12-NEXT:    s_wait_kmcnt 0x0
222; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
223; GFX12-NEXT:    s_setpc_b64 s[30:31]
224entry:
225  %fneg.c = fneg float %c
226  %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
227  %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
228  ret float %ret
229}
230
; fneg applied to fabs: the third operand is fneg(llvm.fabs.f32(%c)).
; Expected output carries both neg_lo:[0,0,1] and neg_hi:[0,0,1] on the dot4
; instruction, with no separate instruction for either source modifier.
231define float @test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
232; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs:
233; GFX12:       ; %bb.0: ; %entry
234; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
235; GFX12-NEXT:    s_wait_expcnt 0x0
236; GFX12-NEXT:    s_wait_samplecnt 0x0
237; GFX12-NEXT:    s_wait_bvhcnt 0x0
238; GFX12-NEXT:    s_wait_kmcnt 0x0
239; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
240; GFX12-NEXT:    s_setpc_b64 s[30:31]
241entry:
242  %fabs.c = call float @llvm.fabs.f32(float %c)
243  %fneg.fabs.c = fneg float %fabs.c
244  %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
245  ret float %ret
246}
247
; Baseline for the bf8.bf8 variant: %a, %b and %c are forwarded unchanged to
; llvm.amdgcn.dot4.f32.bf8.bf8 and the call result is returned.
; Expected output: the function-entry s_wait_* sequence, a single
; v_dot4_f32_bf8_bf8 v0, v0, v1, v2 with no neg_lo/neg_hi modifier, then the
; s_setpc_b64 return.
248define float @test_amdgcn_dot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) {
249; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8:
250; GFX12:       ; %bb.0: ; %entry
251; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
252; GFX12-NEXT:    s_wait_expcnt 0x0
253; GFX12-NEXT:    s_wait_samplecnt 0x0
254; GFX12-NEXT:    s_wait_bvhcnt 0x0
255; GFX12-NEXT:    s_wait_kmcnt 0x0
256; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2
257; GFX12-NEXT:    s_setpc_b64 s[30:31]
258entry:
259  %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
260  ret float %ret
261}
262
; fabs on the third operand: llvm.fabs.f32(%c) feeds the float argument of
; llvm.amdgcn.dot4.f32.bf8.bf8.
; Expected output contains no separate instruction for the fabs; the only
; difference from the baseline test is neg_hi:[0,0,1] on the dot4 instruction.
263define float @test_amdgcn_dot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) {
264; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs:
265; GFX12:       ; %bb.0: ; %entry
266; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
267; GFX12-NEXT:    s_wait_expcnt 0x0
268; GFX12-NEXT:    s_wait_samplecnt 0x0
269; GFX12-NEXT:    s_wait_bvhcnt 0x0
270; GFX12-NEXT:    s_wait_kmcnt 0x0
271; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
272; GFX12-NEXT:    s_setpc_b64 s[30:31]
273entry:
274  %fabs.c = call float @llvm.fabs.f32(float %c)
275  %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.c)
276  ret float %ret
277}
278
; fneg on the third operand: the negated %c feeds the float argument of
; llvm.amdgcn.dot4.f32.bf8.bf8.
; Expected output contains no separate instruction for the fneg; the only
; difference from the baseline test is neg_lo:[0,0,1] on the dot4 instruction.
279define float @test_amdgcn_dot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) {
280; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg:
281; GFX12:       ; %bb.0: ; %entry
282; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
283; GFX12-NEXT:    s_wait_expcnt 0x0
284; GFX12-NEXT:    s_wait_samplecnt 0x0
285; GFX12-NEXT:    s_wait_bvhcnt 0x0
286; GFX12-NEXT:    s_wait_kmcnt 0x0
287; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
288; GFX12-NEXT:    s_setpc_b64 s[30:31]
289entry:
290  %fneg.c = fneg float %c
291  %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.c)
292  ret float %ret
293}
294
; fabs applied to fneg: the third operand is llvm.fabs.f32(fneg %c).
; Expected output carries only neg_hi:[0,0,1] on the dot4 instruction, i.e.
; the same modifier as the plain fabs test; the inner fneg leaves no trace
; in the checked output.
295define float @test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
296; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg:
297; GFX12:       ; %bb.0: ; %entry
298; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
299; GFX12-NEXT:    s_wait_expcnt 0x0
300; GFX12-NEXT:    s_wait_samplecnt 0x0
301; GFX12-NEXT:    s_wait_bvhcnt 0x0
302; GFX12-NEXT:    s_wait_kmcnt 0x0
303; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
304; GFX12-NEXT:    s_setpc_b64 s[30:31]
305entry:
306  %fneg.c = fneg float %c
307  %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
308  %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
309  ret float %ret
310}
311
; fneg applied to fabs: the third operand is fneg(llvm.fabs.f32(%c)).
; Expected output carries both neg_lo:[0,0,1] and neg_hi:[0,0,1] on the dot4
; instruction, with no separate instruction for either source modifier.
312define float @test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
313; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs:
314; GFX12:       ; %bb.0: ; %entry
315; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
316; GFX12-NEXT:    s_wait_expcnt 0x0
317; GFX12-NEXT:    s_wait_samplecnt 0x0
318; GFX12-NEXT:    s_wait_bvhcnt 0x0
319; GFX12-NEXT:    s_wait_kmcnt 0x0
320; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
321; GFX12-NEXT:    s_setpc_b64 s[30:31]
322entry:
323  %fabs.c = call float @llvm.fabs.f32(float %c)
324  %fneg.fabs.c = fneg float %fabs.c
325  %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
326  ret float %ret
327}
328
; Declarations of the four dot4 intrinsics exercised by the test functions in
; this file. All share the same signature: (i32, i32, float) -> float.
329declare float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
330declare float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
331declare float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
332declare float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
333
; Float absolute value; the *_fabs, *_fabs_fneg and *_fneg_fabs tests call it
; to build the third dot4 operand.
334declare float @llvm.fabs.f32(float %a)
335
336