xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll (revision 5a81a559d69fb84e1e8ef623ac4b642081c14c51)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
3; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
4; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950
5; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
6
; Intrinsic under test. Per this declaration it takes two <2 x bfloat>
; vectors (%a, %b), a float (%c) and an i1 clamp flag, and returns a float.
; The kernels in this file pass the flag as a constant; when it is 1 the
; expected v_dot2_f32_bf16 instruction carries the "clamp" modifier.
; NOTE(review): presumably a two-element bf16 dot product accumulated into %c
; (inferred from the name and the v_dot2 instructions) - confirm against the
; AMDGPU intrinsic definition.
7declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
8
; Clamp variant of the fdot2.f32.bf16 test.
; The kernel loads the two <2 x bfloat> operands from %a and %b and the float
; operand from %c, calls llvm.amdgcn.fdot2.f32.bf16 with the clamp flag set
; (i1 1), and stores the float result through %r.
; The autogenerated assertions below expect a v_dot2_f32_bf16 instruction
; carrying the "clamp" modifier for gfx1100 as well as for both gfx950 runs.
; For gfx950 they additionally expect an s_nop 2 between the dot instruction
; and the global store of its result.
9define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
10; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
11; GFX11:       ; %bb.0: ; %entry
12; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
13; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX11-NEXT:    s_load_b32 s6, s[6:7], 0x0
15; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
16; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
17; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
19; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
20; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0 clamp
21; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
22; GFX11-NEXT:    s_endpgm
23;
24; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
25; GFX950:       ; %bb.0: ; %entry
26; GFX950-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
27; GFX950-NEXT:    v_mov_b32_e32 v0, 0
28; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX950-NEXT:    s_load_dword s0, s[12:13], 0x0
30; GFX950-NEXT:    s_load_dword s1, s[14:15], 0x0
31; GFX950-NEXT:    s_load_dword s2, s[10:11], 0x0
32; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX950-NEXT:    v_mov_b32_e32 v1, s0
34; GFX950-NEXT:    v_mov_b32_e32 v2, s1
35; GFX950-NEXT:    v_dot2_f32_bf16 v1, s2, v1, v2 clamp
36; GFX950-NEXT:    s_nop 2
37; GFX950-NEXT:    global_store_dword v0, v1, s[8:9]
38; GFX950-NEXT:    s_endpgm
39;
40; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
41; GFX950-ISEL:       ; %bb.0: ; %entry
42; GFX950-ISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
43; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, 0
44; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
45; GFX950-ISEL-NEXT:    s_load_dword s0, s[12:13], 0x0
46; GFX950-ISEL-NEXT:    s_load_dword s1, s[14:15], 0x0
47; GFX950-ISEL-NEXT:    s_load_dword s2, s[10:11], 0x0
48; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s0
50; GFX950-ISEL-NEXT:    v_mov_b32_e32 v2, s1
51; GFX950-ISEL-NEXT:    v_dot2_f32_bf16 v1, s2, v1, v2 clamp
52; GFX950-ISEL-NEXT:    s_nop 2
53; GFX950-ISEL-NEXT:    global_store_dword v0, v1, s[8:9]
54; GFX950-ISEL-NEXT:    s_endpgm
55    ptr addrspace(1) %r,
56    ptr addrspace(1) %a,
57    ptr addrspace(1) %b,
58    ptr addrspace(1) %c) {
59entry:
60  %a.val = load <2 x bfloat>, ptr addrspace(1) %a
61  %b.val = load <2 x bfloat>, ptr addrspace(1) %b
62  %c.val = load float, ptr addrspace(1) %c
  ; The trailing i1 1 is the clamp flag; it is what distinguishes this kernel
  ; from the no_clamp test in this file.
63  %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 1)
64  store float %r.val, ptr addrspace(1) %r
65  ret void
66}
67
68
; No-clamp variant of the fdot2.f32.bf16 test.
; The kernel loads the two <2 x bfloat> operands from %a and %b and the float
; operand from %c, calls llvm.amdgcn.fdot2.f32.bf16 with the clamp flag
; cleared (i1 0), and stores the float result through %r.
; The autogenerated assertions below expect, for gfx1100, a v_dot2_f32_bf16
; without the "clamp" modifier. For both gfx950 runs they expect the
; v_dot2c_f32_bf16_e32 form instead: its destination v2 is first written by a
; v_mov from s1 and is the register that gets stored afterwards.
; NOTE(review): this looks like the accumulator being tied to the destination
; register in the dot2c encoding - confirm against the ISA documentation.
69define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
70; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
71; GFX11:       ; %bb.0: ; %entry
72; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
73; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX11-NEXT:    s_load_b32 s6, s[6:7], 0x0
75; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
76; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
77; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
79; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
80; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0
81; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
82; GFX11-NEXT:    s_endpgm
83;
84; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
85; GFX950:       ; %bb.0: ; %entry
86; GFX950-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
87; GFX950-NEXT:    v_mov_b32_e32 v0, 0
88; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX950-NEXT:    s_load_dword s0, s[12:13], 0x0
90; GFX950-NEXT:    s_load_dword s1, s[14:15], 0x0
91; GFX950-NEXT:    s_load_dword s2, s[10:11], 0x0
92; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX950-NEXT:    v_mov_b32_e32 v1, s0
94; GFX950-NEXT:    v_mov_b32_e32 v2, s1
95; GFX950-NEXT:    v_dot2c_f32_bf16_e32 v2, s2, v1
96; GFX950-NEXT:    s_nop 2
97; GFX950-NEXT:    global_store_dword v0, v2, s[8:9]
98; GFX950-NEXT:    s_endpgm
99;
100; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
101; GFX950-ISEL:       ; %bb.0: ; %entry
102; GFX950-ISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
103; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, 0
104; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX950-ISEL-NEXT:    s_load_dword s0, s[12:13], 0x0
106; GFX950-ISEL-NEXT:    s_load_dword s1, s[14:15], 0x0
107; GFX950-ISEL-NEXT:    s_load_dword s2, s[10:11], 0x0
108; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s0
110; GFX950-ISEL-NEXT:    v_mov_b32_e32 v2, s1
111; GFX950-ISEL-NEXT:    v_dot2c_f32_bf16_e32 v2, s2, v1
112; GFX950-ISEL-NEXT:    s_nop 2
113; GFX950-ISEL-NEXT:    global_store_dword v0, v2, s[8:9]
114; GFX950-ISEL-NEXT:    s_endpgm
115    ptr addrspace(1) %r,
116    ptr addrspace(1) %a,
117    ptr addrspace(1) %b,
118    ptr addrspace(1) %c) {
119entry:
120  %a.val = load <2 x bfloat>, ptr addrspace(1) %a
121  %b.val = load <2 x bfloat>, ptr addrspace(1) %b
122  %c.val = load float, ptr addrspace(1) %c
  ; The trailing i1 0 leaves the clamp flag cleared, so no "clamp" modifier is
  ; expected on the selected instruction.
123  %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 0)
124  store float %r.val, ptr addrspace(1) %r
125  ret void
126}
127