xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (revision bf274b3d8044cab8478bef50ccf96313e4dbf21e)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s

; Scalar case: loads one half from %a, applies llvm.cos.f16 and stores the
; result to %r.
;
; The assertions inside this function are produced by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
;
; Lowering visible in the expected output, by -mcpu:
;   * tahiti: the value is extended to f32, multiplied by 0x3e22f983
;     (1/(2*pi) as an f32 bit pattern), passed through v_fract_f32 and
;     v_cos_f32, then converted back to f16.
;   * fiji: v_mul_f16 by 0.15915494, then v_fract_f16 before v_cos_f16.
;   * gfx900, gfx1010, gfx1100, gfx1200: v_mul_f16 by 0.15915494 feeds
;     v_cos_f16 directly, with no fract step.
define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: cos_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
; GFX6-NEXT:    v_fract_f32_e32 v0, v0
; GFX6-NEXT:    v_cos_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: cos_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT:    v_fract_f16_e32 v0, v0
; GFX8-NEXT:    v_cos_f16_e32 v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: cos_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT:    v_cos_f16_e32 v1, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cos_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT:    v_cos_f16_e32 v1, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cos_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cos_f16_e32 v1, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: cos_f16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cos_f16_e32 v1, v1
; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %a.val = load half, ptr addrspace(1) %a
  %r.val = call half @llvm.cos.f16(half %a.val)
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Vector case: loads a <2 x half> from %a, applies llvm.cos.v2f16 and stores
; the result to %r.
;
; The assertions inside this function are produced by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
;
; Lowering visible in the expected output, by -mcpu:
;   * tahiti: each element is extended to f32 on its own (the high element is
;     first shifted down by 16), run through the f32 mul / fract / cos
;     sequence, converted back to f16 and recombined with a shift and an or.
;   * fiji: the high element is scaled with an SDWA v_mul_f16 that reads the
;     constant 0x3118 (the 1/(2*pi) scale factor as an f16 bit pattern) from a
;     register; both elements get v_fract_f16 before v_cos_f16, and the
;     halves are merged with v_or_b32.
;   * gfx900, gfx1010: SDWA multiply for the high element, no fract step,
;     result built with v_pack_b32_f16.
;   * gfx1100, gfx1200: the high element is extracted with v_lshrrev_b32,
;     both elements use plain v_mul_f16, result built with v_pack_b32_f16.
define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: cos_v2f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, 0x3e22f983, v1
; GFX6-NEXT:    v_fract_f32_e32 v1, v1
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
; GFX6-NEXT:    v_fract_f32_e32 v0, v0
; GFX6-NEXT:    v_cos_f32_e32 v0, v0
; GFX6-NEXT:    v_cos_f32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: cos_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT:    v_fract_f16_e32 v1, v1
; GFX8-NEXT:    v_fract_f16_e32 v0, v0
; GFX8-NEXT:    v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT:    v_cos_f16_e32 v3, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: cos_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cos_f16_e32 v2, v3
; GFX9-NEXT:    v_cos_f16_e32 v1, v1
; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cos_v2f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_cos_f16_e32 v2, v3
; GFX10-NEXT:    v_cos_f16_e32 v1, v1
; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cos_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
; GFX11-NEXT:    v_cos_f16_e32 v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cos_f16_e32 v2, v2
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: cos_v2f16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
; GFX12-NEXT:    v_cos_f16_e32 v1, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT:    v_cos_f16_e32 v2, v2
; GFX12-NEXT:    v_pack_b32_f16 v1, v1, v2
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; Cosine intrinsics called by the kernels in this file (scalar half and
; <2 x half> overloads).
declare half @llvm.cos.f16(half %a)
declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
