xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll (revision d7acf03cecef0bc62240c97a890077755323424f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
7
; Declarations of the rint intrinsic for a scalar half and for a <2 x half>
; vector; each is called by exactly one kernel in this file.
8declare half @llvm.rint.f16(half %a)
9declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
10
; Scalar case -- loads one half from %a, applies @llvm.rint.f16 and stores the
; result through %r.
; Expected selection, as recorded by the checks inside this function:
;   * tahiti run -- the loaded value is converted to f32, rounded with
;     v_rndne_f32 and converted back to f16 before the 16-bit store.
;   * fiji and gfx900 runs (shared prefix) -- a single v_rndne_f16_e32.
;   * gfx1100 and gfx1200 runs -- a single v_rndne_f16_e32 as well; the two
;     differ in the wait instructions and in the buffer offset operand
;     (0 versus null).
11define amdgpu_kernel void @rint_f16(
12; SI-LABEL: rint_f16:
13; SI:       ; %bb.0: ; %entry
14; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
15; SI-NEXT:    s_mov_b32 s7, 0xf000
16; SI-NEXT:    s_mov_b32 s6, -1
17; SI-NEXT:    s_mov_b32 s10, s6
18; SI-NEXT:    s_mov_b32 s11, s7
19; SI-NEXT:    s_waitcnt lgkmcnt(0)
20; SI-NEXT:    s_mov_b32 s8, s2
21; SI-NEXT:    s_mov_b32 s9, s3
22; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
23; SI-NEXT:    s_mov_b32 s4, s0
24; SI-NEXT:    s_mov_b32 s5, s1
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
27; SI-NEXT:    v_rndne_f32_e32 v0, v0
28; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
29; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
30; SI-NEXT:    s_endpgm
31;
32; GFX89-LABEL: rint_f16:
33; GFX89:       ; %bb.0: ; %entry
34; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
35; GFX89-NEXT:    s_mov_b32 s7, 0xf000
36; GFX89-NEXT:    s_mov_b32 s6, -1
37; GFX89-NEXT:    s_mov_b32 s10, s6
38; GFX89-NEXT:    s_mov_b32 s11, s7
39; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX89-NEXT:    s_mov_b32 s8, s2
41; GFX89-NEXT:    s_mov_b32 s9, s3
42; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
43; GFX89-NEXT:    s_mov_b32 s4, s0
44; GFX89-NEXT:    s_mov_b32 s5, s1
45; GFX89-NEXT:    s_waitcnt vmcnt(0)
46; GFX89-NEXT:    v_rndne_f16_e32 v0, v0
47; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
48; GFX89-NEXT:    s_endpgm
49;
50; GFX11-LABEL: rint_f16:
51; GFX11:       ; %bb.0: ; %entry
52; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
53; GFX11-NEXT:    s_mov_b32 s6, -1
54; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
55; GFX11-NEXT:    s_mov_b32 s10, s6
56; GFX11-NEXT:    s_mov_b32 s11, s7
57; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX11-NEXT:    s_mov_b32 s8, s2
59; GFX11-NEXT:    s_mov_b32 s9, s3
60; GFX11-NEXT:    s_mov_b32 s4, s0
61; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
62; GFX11-NEXT:    s_mov_b32 s5, s1
63; GFX11-NEXT:    s_waitcnt vmcnt(0)
64; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
65; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
66; GFX11-NEXT:    s_endpgm
67;
68; GFX12-LABEL: rint_f16:
69; GFX12:       ; %bb.0: ; %entry
70; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
71; GFX12-NEXT:    s_mov_b32 s6, -1
72; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
73; GFX12-NEXT:    s_mov_b32 s10, s6
74; GFX12-NEXT:    s_mov_b32 s11, s7
75; GFX12-NEXT:    s_wait_kmcnt 0x0
76; GFX12-NEXT:    s_mov_b32 s8, s2
77; GFX12-NEXT:    s_mov_b32 s9, s3
78; GFX12-NEXT:    s_mov_b32 s4, s0
79; GFX12-NEXT:    buffer_load_u16 v0, off, s[8:11], null
80; GFX12-NEXT:    s_mov_b32 s5, s1
81; GFX12-NEXT:    s_wait_loadcnt 0x0
82; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
83; GFX12-NEXT:    buffer_store_b16 v0, off, s[4:7], null
84; GFX12-NEXT:    s_endpgm
85    ptr addrspace(1) %r,
86    ptr addrspace(1) %a) {
87entry:
88  %a.val = load half, ptr addrspace(1) %a
89  %r.val = call half @llvm.rint.f16(half %a.val)
90  store half %r.val, ptr addrspace(1) %r
91  ret void
92}
93
94; The original test with manual checks also had these NOT directives:
95; COM: SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
96; COM: SI-NOT: v_and_b32
97; COM: SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
98; COM: VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
99; COM: VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
100; COM: VI-NOT: v_and_b32
101; COM: VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; Vector case -- loads a <2 x half> from %a as one dword, applies
; @llvm.rint.v2f16 and stores the packed result through %r.
; Expected selection, as recorded by the checks inside this function:
;   * tahiti run -- the upper 16 bits are extracted with v_lshrrev_b32, each
;     element goes through cvt-to-f32 / v_rndne_f32 / cvt-to-f16, and the
;     dword is rebuilt with v_lshlrev_b32 plus v_or_b32 (no v_and_b32).
;   * fiji run -- v_rndne_f16_sdwa reads and writes WORD_1, v_rndne_f16_e32
;     rounds v0, and the two results are joined with v_or_b32.
;   * gfx900 run -- v_rndne_f16_e32 plus v_rndne_f16_sdwa reading WORD_1, with
;     the results combined by v_pack_b32_f16.
;   * gfx1100 and gfx1200 runs -- v_lshrrev_b32 extracts the upper element,
;     two v_rndne_f16_e32 round the elements, and v_pack_b32_f16 repacks them.
102define amdgpu_kernel void @rint_v2f16(
103; SI-LABEL: rint_v2f16:
104; SI:       ; %bb.0: ; %entry
105; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
106; SI-NEXT:    s_mov_b32 s7, 0xf000
107; SI-NEXT:    s_mov_b32 s6, -1
108; SI-NEXT:    s_mov_b32 s10, s6
109; SI-NEXT:    s_mov_b32 s11, s7
110; SI-NEXT:    s_waitcnt lgkmcnt(0)
111; SI-NEXT:    s_mov_b32 s8, s2
112; SI-NEXT:    s_mov_b32 s9, s3
113; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
114; SI-NEXT:    s_mov_b32 s4, s0
115; SI-NEXT:    s_mov_b32 s5, s1
116; SI-NEXT:    s_waitcnt vmcnt(0)
117; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
118; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
119; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
120; SI-NEXT:    v_rndne_f32_e32 v1, v1
121; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
122; SI-NEXT:    v_rndne_f32_e32 v0, v0
123; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
124; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
125; SI-NEXT:    v_or_b32_e32 v0, v0, v1
126; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
127; SI-NEXT:    s_endpgm
128;
129; VI-LABEL: rint_v2f16:
130; VI:       ; %bb.0: ; %entry
131; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
132; VI-NEXT:    s_mov_b32 s7, 0xf000
133; VI-NEXT:    s_mov_b32 s6, -1
134; VI-NEXT:    s_mov_b32 s10, s6
135; VI-NEXT:    s_mov_b32 s11, s7
136; VI-NEXT:    s_waitcnt lgkmcnt(0)
137; VI-NEXT:    s_mov_b32 s8, s2
138; VI-NEXT:    s_mov_b32 s9, s3
139; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
140; VI-NEXT:    s_mov_b32 s4, s0
141; VI-NEXT:    s_mov_b32 s5, s1
142; VI-NEXT:    s_waitcnt vmcnt(0)
143; VI-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
144; VI-NEXT:    v_rndne_f16_e32 v0, v0
145; VI-NEXT:    v_or_b32_e32 v0, v0, v1
146; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
147; VI-NEXT:    s_endpgm
148;
149; GFX9-LABEL: rint_v2f16:
150; GFX9:       ; %bb.0: ; %entry
151; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
152; GFX9-NEXT:    s_mov_b32 s7, 0xf000
153; GFX9-NEXT:    s_mov_b32 s6, -1
154; GFX9-NEXT:    s_mov_b32 s10, s6
155; GFX9-NEXT:    s_mov_b32 s11, s7
156; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX9-NEXT:    s_mov_b32 s8, s2
158; GFX9-NEXT:    s_mov_b32 s9, s3
159; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
160; GFX9-NEXT:    s_mov_b32 s4, s0
161; GFX9-NEXT:    s_mov_b32 s5, s1
162; GFX9-NEXT:    s_waitcnt vmcnt(0)
163; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
164; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
165; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
166; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
167; GFX9-NEXT:    s_endpgm
168;
169; GFX11-LABEL: rint_v2f16:
170; GFX11:       ; %bb.0: ; %entry
171; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
172; GFX11-NEXT:    s_mov_b32 s6, -1
173; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
174; GFX11-NEXT:    s_mov_b32 s10, s6
175; GFX11-NEXT:    s_mov_b32 s11, s7
176; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
177; GFX11-NEXT:    s_mov_b32 s8, s2
178; GFX11-NEXT:    s_mov_b32 s9, s3
179; GFX11-NEXT:    s_mov_b32 s4, s0
180; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
181; GFX11-NEXT:    s_mov_b32 s5, s1
182; GFX11-NEXT:    s_waitcnt vmcnt(0)
183; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
184; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
185; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
186; GFX11-NEXT:    v_rndne_f16_e32 v1, v1
187; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
188; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
189; GFX11-NEXT:    s_endpgm
190;
191; GFX12-LABEL: rint_v2f16:
192; GFX12:       ; %bb.0: ; %entry
193; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
194; GFX12-NEXT:    s_mov_b32 s6, -1
195; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
196; GFX12-NEXT:    s_mov_b32 s10, s6
197; GFX12-NEXT:    s_mov_b32 s11, s7
198; GFX12-NEXT:    s_wait_kmcnt 0x0
199; GFX12-NEXT:    s_mov_b32 s8, s2
200; GFX12-NEXT:    s_mov_b32 s9, s3
201; GFX12-NEXT:    s_mov_b32 s4, s0
202; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
203; GFX12-NEXT:    s_mov_b32 s5, s1
204; GFX12-NEXT:    s_wait_loadcnt 0x0
205; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
206; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
207; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
208; GFX12-NEXT:    v_rndne_f16_e32 v1, v1
209; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
210; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
211; GFX12-NEXT:    s_endpgm
212    ptr addrspace(1) %r,
213    ptr addrspace(1) %a) {
214entry:
215  %a.val = load <2 x half>, ptr addrspace(1) %a
216  %r.val = call <2 x half> @llvm.rint.v2f16(<2 x half> %a.val)
217  store <2 x half> %r.val, ptr addrspace(1) %r
218  ret void
219}
220