xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll (revision 6934870a134ce9000752f0613295596fb876e5c6)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX940 %s
3; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX940 %s
4; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
5; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
6; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
7; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
8
; Declarations of the AMDGPU FP8/BF8 conversion intrinsics called by the tests
; below, grouped by signature:
;   cvt.f32.bf8 / cvt.f32.fp8       (i32, i32) -> float
;       the tests pass an immediate 0..3 as the second operand
;   cvt.pk.f32.bf8 / cvt.pk.f32.fp8 (i32, i1) -> <2 x float>
;       the tests pass an immediate false/true as the second operand
;   cvt.pk.bf8.f32 / cvt.pk.fp8.f32 (float, float, i32, i1) -> i32
;   cvt.sr.bf8.f32 / cvt.sr.fp8.f32 (float, i32, i32, i32) -> i32
; NOTE(review): the trailing operand looks like a byte/word selector judging by
; the test names and the expected asm modifiers; confirm against the intrinsic
; definitions.
9declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
10declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
11declare <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32, i1)
12declare <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32, i1)
13declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1)
14declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1)
15declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32)
16declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32)
17
; Calls llvm.amdgcn.cvt.f32.bf8 on %a with an immediate second operand of 0.
; The expected output differs between the two gfx9 targets, so they use
; separate prefixes here: gfx940 expects the SDWA form with src0_sel:BYTE_0,
; while gfx950 and gfx1200 expect the plain e32 form with no selector.
18define float @test_cvt_f32_bf8_byte0(i32 %a) {
19; GFX940-LABEL: test_cvt_f32_bf8_byte0:
20; GFX940:       ; %bb.0:
21; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX940-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0
23; GFX940-NEXT:    s_setpc_b64 s[30:31]
24;
25; GFX950-LABEL: test_cvt_f32_bf8_byte0:
26; GFX950:       ; %bb.0:
27; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28; GFX950-NEXT:    v_cvt_f32_bf8_e32 v0, v0
29; GFX950-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX12-LABEL: test_cvt_f32_bf8_byte0:
32; GFX12:       ; %bb.0:
33; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
34; GFX12-NEXT:    s_wait_expcnt 0x0
35; GFX12-NEXT:    s_wait_samplecnt 0x0
36; GFX12-NEXT:    s_wait_bvhcnt 0x0
37; GFX12-NEXT:    s_wait_kmcnt 0x0
38; GFX12-NEXT:    v_cvt_f32_bf8_e32 v0, v0
39; GFX12-NEXT:    s_setpc_b64 s[30:31]
40  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
41  ret float %ret
42}
43
; Calls llvm.amdgcn.cvt.f32.bf8 on %a with an immediate second operand of 1.
; gfx940 and gfx950 share one expectation (SDWA form, src0_sel:BYTE_1);
; gfx1200 expects the e64 form with byte_sel:1.
44define float @test_cvt_f32_bf8_byte1(i32 %a) {
45; GFX9X-LABEL: test_cvt_f32_bf8_byte1:
46; GFX9X:       ; %bb.0:
47; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1
49; GFX9X-NEXT:    s_setpc_b64 s[30:31]
50;
51; GFX12-LABEL: test_cvt_f32_bf8_byte1:
52; GFX12:       ; %bb.0:
53; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
54; GFX12-NEXT:    s_wait_expcnt 0x0
55; GFX12-NEXT:    s_wait_samplecnt 0x0
56; GFX12-NEXT:    s_wait_bvhcnt 0x0
57; GFX12-NEXT:    s_wait_kmcnt 0x0
58; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
59; GFX12-NEXT:    s_setpc_b64 s[30:31]
60  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
61  ret float %ret
62}
63
; Calls llvm.amdgcn.cvt.f32.bf8 on %a with an immediate second operand of 2.
; gfx940 and gfx950 share one expectation (SDWA form, src0_sel:BYTE_2);
; gfx1200 expects the e64 form with byte_sel:2.
64define float @test_cvt_f32_bf8_byte2(i32 %a) {
65; GFX9X-LABEL: test_cvt_f32_bf8_byte2:
66; GFX9X:       ; %bb.0:
67; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2
69; GFX9X-NEXT:    s_setpc_b64 s[30:31]
70;
71; GFX12-LABEL: test_cvt_f32_bf8_byte2:
72; GFX12:       ; %bb.0:
73; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
74; GFX12-NEXT:    s_wait_expcnt 0x0
75; GFX12-NEXT:    s_wait_samplecnt 0x0
76; GFX12-NEXT:    s_wait_bvhcnt 0x0
77; GFX12-NEXT:    s_wait_kmcnt 0x0
78; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
79; GFX12-NEXT:    s_setpc_b64 s[30:31]
80  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
81  ret float %ret
82}
83
; Calls llvm.amdgcn.cvt.f32.bf8 on %a with an immediate second operand of 3.
; gfx940 and gfx950 share one expectation (SDWA form, src0_sel:BYTE_3);
; gfx1200 expects the e64 form with byte_sel:3.
84define float @test_cvt_f32_bf8_byte3(i32 %a) {
85; GFX9X-LABEL: test_cvt_f32_bf8_byte3:
86; GFX9X:       ; %bb.0:
87; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3
89; GFX9X-NEXT:    s_setpc_b64 s[30:31]
90;
91; GFX12-LABEL: test_cvt_f32_bf8_byte3:
92; GFX12:       ; %bb.0:
93; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
94; GFX12-NEXT:    s_wait_expcnt 0x0
95; GFX12-NEXT:    s_wait_samplecnt 0x0
96; GFX12-NEXT:    s_wait_bvhcnt 0x0
97; GFX12-NEXT:    s_wait_kmcnt 0x0
98; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
99; GFX12-NEXT:    s_setpc_b64 s[30:31]
100  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
101  ret float %ret
102}
103
; Calls llvm.amdgcn.cvt.f32.fp8 on %a with an immediate second operand of 0.
; The expected output differs between the two gfx9 targets, so they use
; separate prefixes here: gfx940 expects the SDWA form with src0_sel:BYTE_0,
; while gfx950 and gfx1200 expect the plain e32 form with no selector.
104define float @test_cvt_f32_fp8_byte0(i32 %a) {
105; GFX940-LABEL: test_cvt_f32_fp8_byte0:
106; GFX940:       ; %bb.0:
107; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; GFX940-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0
109; GFX940-NEXT:    s_setpc_b64 s[30:31]
110;
111; GFX950-LABEL: test_cvt_f32_fp8_byte0:
112; GFX950:       ; %bb.0:
113; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX950-NEXT:    v_cvt_f32_fp8_e32 v0, v0
115; GFX950-NEXT:    s_setpc_b64 s[30:31]
116;
117; GFX12-LABEL: test_cvt_f32_fp8_byte0:
118; GFX12:       ; %bb.0:
119; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
120; GFX12-NEXT:    s_wait_expcnt 0x0
121; GFX12-NEXT:    s_wait_samplecnt 0x0
122; GFX12-NEXT:    s_wait_bvhcnt 0x0
123; GFX12-NEXT:    s_wait_kmcnt 0x0
124; GFX12-NEXT:    v_cvt_f32_fp8_e32 v0, v0
125; GFX12-NEXT:    s_setpc_b64 s[30:31]
126  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
127  ret float %ret
128}
129
; Calls llvm.amdgcn.cvt.f32.fp8 on %a with an immediate second operand of 1.
; gfx940 and gfx950 share one expectation (SDWA form, src0_sel:BYTE_1);
; gfx1200 expects the e64 form with byte_sel:1.
130define float @test_cvt_f32_fp8_byte1(i32 %a) {
131; GFX9X-LABEL: test_cvt_f32_fp8_byte1:
132; GFX9X:       ; %bb.0:
133; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1
135; GFX9X-NEXT:    s_setpc_b64 s[30:31]
136;
137; GFX12-LABEL: test_cvt_f32_fp8_byte1:
138; GFX12:       ; %bb.0:
139; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
140; GFX12-NEXT:    s_wait_expcnt 0x0
141; GFX12-NEXT:    s_wait_samplecnt 0x0
142; GFX12-NEXT:    s_wait_bvhcnt 0x0
143; GFX12-NEXT:    s_wait_kmcnt 0x0
144; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
145; GFX12-NEXT:    s_setpc_b64 s[30:31]
146  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
147  ret float %ret
148}
149
; Calls llvm.amdgcn.cvt.f32.fp8 on %a with an immediate second operand of 2.
; gfx940 and gfx950 share one expectation (SDWA form, src0_sel:BYTE_2);
; gfx1200 expects the e64 form with byte_sel:2.
150define float @test_cvt_f32_fp8_byte2(i32 %a) {
151; GFX9X-LABEL: test_cvt_f32_fp8_byte2:
152; GFX9X:       ; %bb.0:
153; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2
155; GFX9X-NEXT:    s_setpc_b64 s[30:31]
156;
157; GFX12-LABEL: test_cvt_f32_fp8_byte2:
158; GFX12:       ; %bb.0:
159; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
160; GFX12-NEXT:    s_wait_expcnt 0x0
161; GFX12-NEXT:    s_wait_samplecnt 0x0
162; GFX12-NEXT:    s_wait_bvhcnt 0x0
163; GFX12-NEXT:    s_wait_kmcnt 0x0
164; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
165; GFX12-NEXT:    s_setpc_b64 s[30:31]
166  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
167  ret float %ret
168}
169
; Calls llvm.amdgcn.cvt.f32.fp8 on %a with an immediate second operand of 3.
; gfx940 and gfx950 share one expectation (SDWA form, src0_sel:BYTE_3);
; gfx1200 expects the e64 form with byte_sel:3.
170define float @test_cvt_f32_fp8_byte3(i32 %a) {
171; GFX9X-LABEL: test_cvt_f32_fp8_byte3:
172; GFX9X:       ; %bb.0:
173; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3
175; GFX9X-NEXT:    s_setpc_b64 s[30:31]
176;
177; GFX12-LABEL: test_cvt_f32_fp8_byte3:
178; GFX12:       ; %bb.0:
179; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
180; GFX12-NEXT:    s_wait_expcnt 0x0
181; GFX12-NEXT:    s_wait_samplecnt 0x0
182; GFX12-NEXT:    s_wait_bvhcnt 0x0
183; GFX12-NEXT:    s_wait_kmcnt 0x0
184; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
185; GFX12-NEXT:    s_setpc_b64 s[30:31]
186  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
187  ret float %ret
188}
189
; Calls llvm.amdgcn.cvt.pk.f32.bf8 on %a with the i1 operand set to false.
; Every target expects the plain e32 form writing the v[0:1] register pair,
; with no word selector.
190define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) {
191; GFX9X-LABEL: test_cvt_pk_f32_bf8_word0:
192; GFX9X:       ; %bb.0:
193; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194; GFX9X-NEXT:    v_cvt_pk_f32_bf8_e32 v[0:1], v0
195; GFX9X-NEXT:    s_setpc_b64 s[30:31]
196;
197; GFX12-LABEL: test_cvt_pk_f32_bf8_word0:
198; GFX12:       ; %bb.0:
199; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
200; GFX12-NEXT:    s_wait_expcnt 0x0
201; GFX12-NEXT:    s_wait_samplecnt 0x0
202; GFX12-NEXT:    s_wait_bvhcnt 0x0
203; GFX12-NEXT:    s_wait_kmcnt 0x0
204; GFX12-NEXT:    v_cvt_pk_f32_bf8_e32 v[0:1], v0
205; GFX12-NEXT:    s_setpc_b64 s[30:31]
206  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
207  ret <2 x float> %ret
208}
209
; Calls llvm.amdgcn.cvt.pk.f32.bf8 on %a with the i1 operand set to true.
; gfx940 and gfx950 expect the SDWA form with src0_sel:WORD_1; gfx1200 expects
; the e64 form with op_sel:[1,0].
210define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) {
211; GFX9X-LABEL: test_cvt_pk_f32_bf8_word1:
212; GFX9X:       ; %bb.0:
213; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214; GFX9X-NEXT:    v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1
215; GFX9X-NEXT:    s_setpc_b64 s[30:31]
216;
217; GFX12-LABEL: test_cvt_pk_f32_bf8_word1:
218; GFX12:       ; %bb.0:
219; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
220; GFX12-NEXT:    s_wait_expcnt 0x0
221; GFX12-NEXT:    s_wait_samplecnt 0x0
222; GFX12-NEXT:    s_wait_bvhcnt 0x0
223; GFX12-NEXT:    s_wait_kmcnt 0x0
224; GFX12-NEXT:    v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
225; GFX12-NEXT:    s_setpc_b64 s[30:31]
226  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true)
227  ret <2 x float> %ret
228}
229
; Calls llvm.amdgcn.cvt.pk.f32.fp8 on %a with the i1 operand set to false.
; Every target expects the plain e32 form writing the v[0:1] register pair,
; with no word selector.
230define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) {
231; GFX9X-LABEL: test_cvt_pk_f32_fp8_word0:
232; GFX9X:       ; %bb.0:
233; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX9X-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
235; GFX9X-NEXT:    s_setpc_b64 s[30:31]
236;
237; GFX12-LABEL: test_cvt_pk_f32_fp8_word0:
238; GFX12:       ; %bb.0:
239; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
240; GFX12-NEXT:    s_wait_expcnt 0x0
241; GFX12-NEXT:    s_wait_samplecnt 0x0
242; GFX12-NEXT:    s_wait_bvhcnt 0x0
243; GFX12-NEXT:    s_wait_kmcnt 0x0
244; GFX12-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
245; GFX12-NEXT:    s_setpc_b64 s[30:31]
246  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false)
247  ret <2 x float> %ret
248}
249
; Calls llvm.amdgcn.cvt.pk.f32.fp8 on %a with the i1 operand set to true.
; gfx940 and gfx950 expect the SDWA form with src0_sel:WORD_1; gfx1200 expects
; the e64 form with op_sel:[1,0].
250define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) {
251; GFX9X-LABEL: test_cvt_pk_f32_fp8_word1:
252; GFX9X:       ; %bb.0:
253; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254; GFX9X-NEXT:    v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1
255; GFX9X-NEXT:    s_setpc_b64 s[30:31]
256;
257; GFX12-LABEL: test_cvt_pk_f32_fp8_word1:
258; GFX12:       ; %bb.0:
259; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
260; GFX12-NEXT:    s_wait_expcnt 0x0
261; GFX12-NEXT:    s_wait_samplecnt 0x0
262; GFX12-NEXT:    s_wait_bvhcnt 0x0
263; GFX12-NEXT:    s_wait_kmcnt 0x0
264; GFX12-NEXT:    v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0]
265; GFX12-NEXT:    s_setpc_b64 s[30:31]
266  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
267  ret <2 x float> %ret
268}
269
; Calls llvm.amdgcn.cvt.pk.bf8.f32 on (%x, %y, %old) with the i1 operand set to
; false. Every target expects the conversion with no op_sel modifier, writing
; v2, followed by a v_mov_b32 copying v2 into v0. gfx1200 additionally expects
; an s_delay_alu between the two.
; NOTE(review): v2 presumably holds %old on entry, making it a tied
; destination; confirm against the instruction definition.
270define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
271; GFX9X-LABEL: test_cvt_pk_bf8_f32_word0:
272; GFX9X:       ; %bb.0:
273; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274; GFX9X-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1
275; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
276; GFX9X-NEXT:    s_setpc_b64 s[30:31]
277;
278; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
279; GFX12:       ; %bb.0:
280; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
281; GFX12-NEXT:    s_wait_expcnt 0x0
282; GFX12-NEXT:    s_wait_samplecnt 0x0
283; GFX12-NEXT:    s_wait_bvhcnt 0x0
284; GFX12-NEXT:    s_wait_kmcnt 0x0
285; GFX12-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1
286; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
287; GFX12-NEXT:    v_mov_b32_e32 v0, v2
288; GFX12-NEXT:    s_setpc_b64 s[30:31]
289  %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
290  ret i32 %ret
291}
292
; Calls llvm.amdgcn.cvt.pk.bf8.f32 on (%x, %y, %old) with the i1 operand set to
; true. Every target expects op_sel:[0,0,1] on the conversion. On gfx940 and
; gfx950 the checks also expect an s_nop 0 between the conversion and the
; v_mov_b32 that copies v2 into v0; gfx1200 expects an s_delay_alu there.
; NOTE(review): the s_nop is presumably a hazard-recognizer insertion tied to
; the op_sel destination write; confirm in the gfx940 hazard handling.
293define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
294; GFX9X-LABEL: test_cvt_pk_bf8_f32_word1:
295; GFX9X:       ; %bb.0:
296; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; GFX9X-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
298; GFX9X-NEXT:    s_nop 0
299; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
300; GFX9X-NEXT:    s_setpc_b64 s[30:31]
301;
302; GFX12-LABEL: test_cvt_pk_bf8_f32_word1:
303; GFX12:       ; %bb.0:
304; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
305; GFX12-NEXT:    s_wait_expcnt 0x0
306; GFX12-NEXT:    s_wait_samplecnt 0x0
307; GFX12-NEXT:    s_wait_bvhcnt 0x0
308; GFX12-NEXT:    s_wait_kmcnt 0x0
309; GFX12-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
310; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
311; GFX12-NEXT:    v_mov_b32_e32 v0, v2
312; GFX12-NEXT:    s_setpc_b64 s[30:31]
313  %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
314  ret i32 %ret
315}
316
; Calls llvm.amdgcn.cvt.pk.fp8.f32 on (%x, %y, %old) with the i1 operand set to
; false. Every target expects the conversion with no op_sel modifier, writing
; v2, followed by a v_mov_b32 copying v2 into v0. gfx1200 additionally expects
; an s_delay_alu between the two.
; NOTE(review): v2 presumably holds %old on entry, making it a tied
; destination; confirm against the instruction definition.
317define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
318; GFX9X-LABEL: test_cvt_pk_fp8_f32_word0:
319; GFX9X:       ; %bb.0:
320; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321; GFX9X-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1
322; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
323; GFX9X-NEXT:    s_setpc_b64 s[30:31]
324;
325; GFX12-LABEL: test_cvt_pk_fp8_f32_word0:
326; GFX12:       ; %bb.0:
327; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
328; GFX12-NEXT:    s_wait_expcnt 0x0
329; GFX12-NEXT:    s_wait_samplecnt 0x0
330; GFX12-NEXT:    s_wait_bvhcnt 0x0
331; GFX12-NEXT:    s_wait_kmcnt 0x0
332; GFX12-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1
333; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
334; GFX12-NEXT:    v_mov_b32_e32 v0, v2
335; GFX12-NEXT:    s_setpc_b64 s[30:31]
336  %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
337  ret i32 %ret
338}
339
; Calls llvm.amdgcn.cvt.pk.fp8.f32 on (%x, %y, %old) with the i1 operand set to
; true. Every target expects op_sel:[0,0,1] on the conversion. On gfx940 and
; gfx950 the checks also expect an s_nop 0 between the conversion and the
; v_mov_b32 that copies v2 into v0; gfx1200 expects an s_delay_alu there.
; NOTE(review): the s_nop is presumably a hazard-recognizer insertion tied to
; the op_sel destination write; confirm in the gfx940 hazard handling.
340define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
341; GFX9X-LABEL: test_cvt_pk_fp8_f32_word1:
342; GFX9X:       ; %bb.0:
343; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; GFX9X-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
345; GFX9X-NEXT:    s_nop 0
346; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
347; GFX9X-NEXT:    s_setpc_b64 s[30:31]
348;
349; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
350; GFX12:       ; %bb.0:
351; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
352; GFX12-NEXT:    s_wait_expcnt 0x0
353; GFX12-NEXT:    s_wait_samplecnt 0x0
354; GFX12-NEXT:    s_wait_bvhcnt 0x0
355; GFX12-NEXT:    s_wait_kmcnt 0x0
356; GFX12-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
357; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
358; GFX12-NEXT:    v_mov_b32_e32 v0, v2
359; GFX12-NEXT:    s_setpc_b64 s[30:31]
360  %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
361  ret i32 %ret
362}
363
; Calls llvm.amdgcn.cvt.sr.bf8.f32 on (%x, %r, %old) with an immediate last
; operand of 0. Every target expects the conversion with no selector modifier,
; writing v2, followed by a v_mov_b32 copying v2 into v0. gfx1200 additionally
; expects an s_delay_alu between the two.
364define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) {
365; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte0:
366; GFX9X:       ; %bb.0:
367; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
368; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1
369; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
370; GFX9X-NEXT:    s_setpc_b64 s[30:31]
371;
372; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0:
373; GFX12:       ; %bb.0:
374; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
375; GFX12-NEXT:    s_wait_expcnt 0x0
376; GFX12-NEXT:    s_wait_samplecnt 0x0
377; GFX12-NEXT:    s_wait_bvhcnt 0x0
378; GFX12-NEXT:    s_wait_kmcnt 0x0
379; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1
380; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
381; GFX12-NEXT:    v_mov_b32_e32 v0, v2
382; GFX12-NEXT:    s_setpc_b64 s[30:31]
383  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0)
384  ret i32 %ret
385}
386
; Calls llvm.amdgcn.cvt.sr.bf8.f32 on (%x, %r, %old) with an immediate last
; operand of 1. gfx940 and gfx950 expect op_sel:[0,0,1,0] followed by an
; s_nop 0 before the v_mov_b32; gfx1200 expects byte_sel:1 followed by an
; s_delay_alu.
387define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
388; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte1:
389; GFX9X:       ; %bb.0:
390; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
392; GFX9X-NEXT:    s_nop 0
393; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
394; GFX9X-NEXT:    s_setpc_b64 s[30:31]
395;
396; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1:
397; GFX12:       ; %bb.0:
398; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
399; GFX12-NEXT:    s_wait_expcnt 0x0
400; GFX12-NEXT:    s_wait_samplecnt 0x0
401; GFX12-NEXT:    s_wait_bvhcnt 0x0
402; GFX12-NEXT:    s_wait_kmcnt 0x0
403; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:1
404; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
405; GFX12-NEXT:    v_mov_b32_e32 v0, v2
406; GFX12-NEXT:    s_setpc_b64 s[30:31]
407  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1)
408  ret i32 %ret
409}
410
; Calls llvm.amdgcn.cvt.sr.bf8.f32 on (%x, %r, %old) with an immediate last
; operand of 2. gfx940 and gfx950 expect op_sel:[0,0,0,1] followed by an
; s_nop 0 before the v_mov_b32; gfx1200 expects byte_sel:2 followed by an
; s_delay_alu.
411define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
412; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte2:
413; GFX9X:       ; %bb.0:
414; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
415; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1]
416; GFX9X-NEXT:    s_nop 0
417; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
418; GFX9X-NEXT:    s_setpc_b64 s[30:31]
419;
420; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2:
421; GFX12:       ; %bb.0:
422; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
423; GFX12-NEXT:    s_wait_expcnt 0x0
424; GFX12-NEXT:    s_wait_samplecnt 0x0
425; GFX12-NEXT:    s_wait_bvhcnt 0x0
426; GFX12-NEXT:    s_wait_kmcnt 0x0
427; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:2
428; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
429; GFX12-NEXT:    v_mov_b32_e32 v0, v2
430; GFX12-NEXT:    s_setpc_b64 s[30:31]
431  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2)
432  ret i32 %ret
433}
434
; Calls llvm.amdgcn.cvt.sr.bf8.f32 on (%x, %r, %old) with an immediate last
; operand of 3. gfx940 and gfx950 expect op_sel:[0,0,1,1] followed by an
; s_nop 0 before the v_mov_b32; gfx1200 expects byte_sel:3 followed by an
; s_delay_alu.
435define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
436; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte3:
437; GFX9X:       ; %bb.0:
438; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1]
440; GFX9X-NEXT:    s_nop 0
441; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
442; GFX9X-NEXT:    s_setpc_b64 s[30:31]
443;
444; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3:
445; GFX12:       ; %bb.0:
446; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
447; GFX12-NEXT:    s_wait_expcnt 0x0
448; GFX12-NEXT:    s_wait_samplecnt 0x0
449; GFX12-NEXT:    s_wait_bvhcnt 0x0
450; GFX12-NEXT:    s_wait_kmcnt 0x0
451; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:3
452; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
453; GFX12-NEXT:    v_mov_b32_e32 v0, v2
454; GFX12-NEXT:    s_setpc_b64 s[30:31]
455  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3)
456  ret i32 %ret
457}
458
; Calls llvm.amdgcn.cvt.sr.fp8.f32 on (%x, %r, %old) with an immediate last
; operand of 0. Every target expects the conversion with no selector modifier,
; writing v2, followed by a v_mov_b32 copying v2 into v0. gfx1200 additionally
; expects an s_delay_alu between the two.
459define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
460; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte0:
461; GFX9X:       ; %bb.0:
462; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1
464; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
465; GFX9X-NEXT:    s_setpc_b64 s[30:31]
466;
467; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0:
468; GFX12:       ; %bb.0:
469; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
470; GFX12-NEXT:    s_wait_expcnt 0x0
471; GFX12-NEXT:    s_wait_samplecnt 0x0
472; GFX12-NEXT:    s_wait_bvhcnt 0x0
473; GFX12-NEXT:    s_wait_kmcnt 0x0
474; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1
475; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
476; GFX12-NEXT:    v_mov_b32_e32 v0, v2
477; GFX12-NEXT:    s_setpc_b64 s[30:31]
478  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0)
479  ret i32 %ret
480}
481
; Calls llvm.amdgcn.cvt.sr.fp8.f32 on (%x, %r, %old) with an immediate last
; operand of 1. gfx940 and gfx950 expect op_sel:[0,0,1,0] followed by an
; s_nop 0 before the v_mov_b32; gfx1200 expects byte_sel:1 followed by an
; s_delay_alu.
482define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
483; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte1:
484; GFX9X:       ; %bb.0:
485; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
487; GFX9X-NEXT:    s_nop 0
488; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
489; GFX9X-NEXT:    s_setpc_b64 s[30:31]
490;
491; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
492; GFX12:       ; %bb.0:
493; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
494; GFX12-NEXT:    s_wait_expcnt 0x0
495; GFX12-NEXT:    s_wait_samplecnt 0x0
496; GFX12-NEXT:    s_wait_bvhcnt 0x0
497; GFX12-NEXT:    s_wait_kmcnt 0x0
498; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1
499; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
500; GFX12-NEXT:    v_mov_b32_e32 v0, v2
501; GFX12-NEXT:    s_setpc_b64 s[30:31]
502  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1)
503  ret i32 %ret
504}
505
; Calls llvm.amdgcn.cvt.sr.fp8.f32 on (%x, %r, %old) with an immediate last
; operand of 2. gfx940 and gfx950 expect op_sel:[0,0,0,1] followed by an
; s_nop 0 before the v_mov_b32; gfx1200 expects byte_sel:2 followed by an
; s_delay_alu.
506define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
507; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte2:
508; GFX9X:       ; %bb.0:
509; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
510; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
511; GFX9X-NEXT:    s_nop 0
512; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
513; GFX9X-NEXT:    s_setpc_b64 s[30:31]
514;
515; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
516; GFX12:       ; %bb.0:
517; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
518; GFX12-NEXT:    s_wait_expcnt 0x0
519; GFX12-NEXT:    s_wait_samplecnt 0x0
520; GFX12-NEXT:    s_wait_bvhcnt 0x0
521; GFX12-NEXT:    s_wait_kmcnt 0x0
522; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2
523; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
524; GFX12-NEXT:    v_mov_b32_e32 v0, v2
525; GFX12-NEXT:    s_setpc_b64 s[30:31]
526  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2)
527  ret i32 %ret
528}
529
; Calls llvm.amdgcn.cvt.sr.fp8.f32 on (%x, %r, %old) with an immediate last
; operand of 3. gfx940 and gfx950 expect op_sel:[0,0,1,1] followed by an
; s_nop 0 before the v_mov_b32; gfx1200 expects byte_sel:3 followed by an
; s_delay_alu.
530define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
531; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte3:
532; GFX9X:       ; %bb.0:
533; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1]
535; GFX9X-NEXT:    s_nop 0
536; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
537; GFX9X-NEXT:    s_setpc_b64 s[30:31]
538;
539; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3:
540; GFX12:       ; %bb.0:
541; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
542; GFX12-NEXT:    s_wait_expcnt 0x0
543; GFX12-NEXT:    s_wait_samplecnt 0x0
544; GFX12-NEXT:    s_wait_bvhcnt 0x0
545; GFX12-NEXT:    s_wait_kmcnt 0x0
546; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3
547; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
548; GFX12-NEXT:    v_mov_b32_e32 v0, v2
549; GFX12-NEXT:    s_setpc_b64 s[30:31]
550  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3)
551  ret i32 %ret
552}
553
; Sign-extends the i16 argument to i32 and passes the result to
; llvm.amdgcn.cvt.f32.fp8 with an immediate second operand of 1. The checks
; expect the extension to stay a separate v_bfe_i32 v0, v0, 0, 16 ahead of the
; conversion, which keeps its BYTE_1 / byte_sel:1 selector.
; NOTE(review): presumably guards against folding the sext into the source
; selector; confirm against the change that introduced this test.
554define float @test_sext_cvt_f32_fp8(i16 %a) {
555; GFX9X-LABEL: test_sext_cvt_f32_fp8:
556; GFX9X:       ; %bb.0:
557; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
558; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
559; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1
560; GFX9X-NEXT:    s_setpc_b64 s[30:31]
561;
562; GFX12-LABEL: test_sext_cvt_f32_fp8:
563; GFX12:       ; %bb.0:
564; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
565; GFX12-NEXT:    s_wait_expcnt 0x0
566; GFX12-NEXT:    s_wait_samplecnt 0x0
567; GFX12-NEXT:    s_wait_bvhcnt 0x0
568; GFX12-NEXT:    s_wait_kmcnt 0x0
569; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
570; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
571; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
572; GFX12-NEXT:    s_setpc_b64 s[30:31]
573  %a.sext = sext i16 %a to i32
574  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
575  ret float %ret
576}
577
; Sign-extends the i16 argument to i32 and passes the result to
; llvm.amdgcn.cvt.f32.bf8 with an immediate second operand of 1. The checks
; expect the extension to stay a separate v_bfe_i32 v0, v0, 0, 16 ahead of the
; conversion, which keeps its BYTE_1 / byte_sel:1 selector.
; NOTE(review): presumably guards against folding the sext into the source
; selector; confirm against the change that introduced this test.
578define float @test_sext_cvt_f32_bf8(i16 %a) {
579; GFX9X-LABEL: test_sext_cvt_f32_bf8:
580; GFX9X:       ; %bb.0:
581; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
583; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1
584; GFX9X-NEXT:    s_setpc_b64 s[30:31]
585;
586; GFX12-LABEL: test_sext_cvt_f32_bf8:
587; GFX12:       ; %bb.0:
588; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
589; GFX12-NEXT:    s_wait_expcnt 0x0
590; GFX12-NEXT:    s_wait_samplecnt 0x0
591; GFX12-NEXT:    s_wait_bvhcnt 0x0
592; GFX12-NEXT:    s_wait_kmcnt 0x0
593; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
594; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
595; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
596; GFX12-NEXT:    s_setpc_b64 s[30:31]
597  %a.sext = sext i16 %a to i32
598  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)
599  ret float %ret
600}
601
; Sign-extends the i16 argument to i32 and passes the result to
; llvm.amdgcn.cvt.pk.f32.bf8 with the i1 operand set to true. The checks expect
; the extension to stay a separate v_bfe_i32 v0, v0, 0, 16 ahead of the
; conversion, which keeps its WORD_1 / op_sel:[1,0] selector.
; NOTE(review): presumably guards against folding the sext into the source
; selector; confirm against the change that introduced this test.
602define <2 x float> @test_sext_cvt_pk_f32_bf8_word1(i16 %a) {
603; GFX9X-LABEL: test_sext_cvt_pk_f32_bf8_word1:
604; GFX9X:       ; %bb.0:
605; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
606; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
607; GFX9X-NEXT:    v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1
608; GFX9X-NEXT:    s_setpc_b64 s[30:31]
609;
610; GFX12-LABEL: test_sext_cvt_pk_f32_bf8_word1:
611; GFX12:       ; %bb.0:
612; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
613; GFX12-NEXT:    s_wait_expcnt 0x0
614; GFX12-NEXT:    s_wait_samplecnt 0x0
615; GFX12-NEXT:    s_wait_bvhcnt 0x0
616; GFX12-NEXT:    s_wait_kmcnt 0x0
617; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
618; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
619; GFX12-NEXT:    v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
620; GFX12-NEXT:    s_setpc_b64 s[30:31]
621  %a.sext = sext i16 %a to i32
622  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a.sext, i1 true)
623  ret <2 x float> %ret
624}
625
; Sign-extends the i16 argument to i32 and passes the result to
; llvm.amdgcn.cvt.pk.f32.fp8 with the i1 operand set to false. The checks
; expect a v_bfe_i32 v0, v0, 0, 16 for the extension, followed by the plain e32
; conversion with no word selector, on every target.
626define <2 x float> @test_sext_cvt_pk_f32_fp8_word0(i16 %a) {
627; GFX9X-LABEL: test_sext_cvt_pk_f32_fp8_word0:
628; GFX9X:       ; %bb.0:
629; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
630; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
631; GFX9X-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
632; GFX9X-NEXT:    s_setpc_b64 s[30:31]
633;
634; GFX12-LABEL: test_sext_cvt_pk_f32_fp8_word0:
635; GFX12:       ; %bb.0:
636; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
637; GFX12-NEXT:    s_wait_expcnt 0x0
638; GFX12-NEXT:    s_wait_samplecnt 0x0
639; GFX12-NEXT:    s_wait_bvhcnt 0x0
640; GFX12-NEXT:    s_wait_kmcnt 0x0
641; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
642; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
643; GFX12-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
644; GFX12-NEXT:    s_setpc_b64 s[30:31]
645  %a.sext = sext i16 %a to i32
646  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a.sext, i1 false)
647  ret <2 x float> %ret
648}
649