; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX940 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX940 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

; Code generation checks for the llvm.amdgcn.cvt.* fp8/bf8 conversion
; intrinsics on gfx940, gfx950 and gfx1200, each compiled with -global-isel
; set to 0 and to 1. Both settings share one set of check prefixes per target,
; so the two selectors are expected to produce identical output.

declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
declare <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32, i1)
declare <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32, i1)
declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1)
declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1)
declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32)
declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32)

; cvt.f32.bf8: the immediate second operand (0-3) picks the source byte.
; gfx940 always uses the SDWA form; gfx950 and gfx1200 use the plain e32
; encoding for byte 0, and gfx1200 uses byte_sel for bytes 1-3.

define float @test_cvt_f32_bf8_byte0(i32 %a) {
; GFX940-LABEL: test_cvt_f32_bf8_byte0:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX950-LABEL: test_cvt_f32_bf8_byte0:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT:    v_cvt_f32_bf8_e32 v0, v0
; GFX950-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_bf8_byte0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_bf8_e32 v0, v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
  ret float %ret
}

define float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX9X-LABEL: test_cvt_f32_bf8_byte1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_bf8_byte1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
  ret float %ret
}

define float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX9X-LABEL: test_cvt_f32_bf8_byte2:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_bf8_byte2:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
  ret float %ret
}

define float @test_cvt_f32_bf8_byte3(i32 %a) {
; GFX9X-LABEL: test_cvt_f32_bf8_byte3:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_bf8_byte3:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
  ret float %ret
}

; cvt.f32.fp8: same byte-select coverage as the bf8 tests above.

define float @test_cvt_f32_fp8_byte0(i32 %a) {
; GFX940-LABEL: test_cvt_f32_fp8_byte0:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX950-LABEL: test_cvt_f32_fp8_byte0:
; GFX950:       ; %bb.0:
; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT:    v_cvt_f32_fp8_e32 v0, v0
; GFX950-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_fp8_byte0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_fp8_e32 v0, v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
  ret float %ret
}

define float @test_cvt_f32_fp8_byte1(i32 %a) {
; GFX9X-LABEL: test_cvt_f32_fp8_byte1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_fp8_byte1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
  ret float %ret
}

define float @test_cvt_f32_fp8_byte2(i32 %a) {
; GFX9X-LABEL: test_cvt_f32_fp8_byte2:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_fp8_byte2:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
  ret float %ret
}

define float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX9X-LABEL: test_cvt_f32_fp8_byte3:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_f32_fp8_byte3:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
  ret float %ret
}

; cvt.pk.f32.{bf8,fp8}: the i1 operand picks source word 0 (false) or word 1
; (true). Word 1 is expressed as src0_sel:WORD_1 on gfx940/gfx950 and as
; op_sel:[1,0] on gfx1200.

define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) {
; GFX9X-LABEL: test_cvt_pk_f32_bf8_word0:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_f32_bf8_e32 v[0:1], v0
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_f32_bf8_word0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_f32_bf8_e32 v[0:1], v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
  ret <2 x float> %ret
}

define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) {
; GFX9X-LABEL: test_cvt_pk_f32_bf8_word1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_f32_bf8_word1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true)
  ret <2 x float> %ret
}

define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) {
; GFX9X-LABEL: test_cvt_pk_f32_fp8_word0:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_f32_fp8_word0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false)
  ret <2 x float> %ret
}

define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) {
; GFX9X-LABEL: test_cvt_pk_f32_fp8_word1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_f32_fp8_word1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
  ret <2 x float> %ret
}

; cvt.pk.{bf8,fp8}.f32: converts %x and %y; the checks show the instruction
; writing into the register that holds %old (v2), which is then copied to the
; return register. The i1 operand selects word 1 via op_sel:[0,0,1]. On
; gfx940/gfx950 the op_sel form is followed by "s_nop 0" before the copy.

define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
; GFX9X-LABEL: test_cvt_pk_bf8_f32_word0:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
  ret i32 %ret
}

define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
; GFX9X-LABEL: test_cvt_pk_bf8_f32_word1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_bf8_f32_word1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
  ret i32 %ret
}

define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
; GFX9X-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
  ret i32 %ret
}

define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
; GFX9X-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
  ret i32 %ret
}

; cvt.sr.{bf8,fp8}.f32: the immediate last operand (0-3) picks the destination
; byte. gfx940/gfx950 encode it in the last two op_sel bits (byte 1 = [0,0,1,0],
; byte 2 = [0,0,0,1], byte 3 = [0,0,1,1]); gfx1200 uses byte_sel. As with the
; packed conversions, the instruction writes into the register holding %old.

define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte0:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0)
  ret i32 %ret
}

define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1)
  ret i32 %ret
}

define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte2:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2)
  ret i32 %ret
}

define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_bf8_f32_byte3:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3)
  ret i32 %ret
}

define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte0:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0)
  ret i32 %ret
}

define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1)
  ret i32 %ret
}

define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2)
  ret i32 %ret
}

define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX9X-LABEL: test_cvt_sr_fp8_f32_byte3:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1]
; GFX9X-NEXT:    s_nop 0
; GFX9X-NEXT:    v_mov_b32_e32 v0, v2
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mov_b32_e32 v0, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3)
  ret i32 %ret
}

; Sign-extended i16 sources: the checks expect the sign extension
; (v_bfe_i32 ..., 0, 16) to be emitted ahead of the conversion for every
; variant below, including the ones that read byte 1 or word 1.

define float @test_sext_cvt_f32_fp8(i16 %a) {
; GFX9X-LABEL: test_sext_cvt_f32_fp8:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9X-NEXT:    v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_sext_cvt_f32_fp8:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %a.sext = sext i16 %a to i32
  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
  ret float %ret
}

define float @test_sext_cvt_f32_bf8(i16 %a) {
; GFX9X-LABEL: test_sext_cvt_f32_bf8:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9X-NEXT:    v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_sext_cvt_f32_bf8:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %a.sext = sext i16 %a to i32
  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)
  ret float %ret
}

define <2 x float> @test_sext_cvt_pk_f32_bf8_word1(i16 %a) {
; GFX9X-LABEL: test_sext_cvt_pk_f32_bf8_word1:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9X-NEXT:    v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_sext_cvt_pk_f32_bf8_word1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %a.sext = sext i16 %a to i32
  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a.sext, i1 true)
  ret <2 x float> %ret
}

define <2 x float> @test_sext_cvt_pk_f32_fp8_word0(i16 %a) {
; GFX9X-LABEL: test_sext_cvt_pk_f32_fp8_word0:
; GFX9X:       ; %bb.0:
; GFX9X-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9X-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9X-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
; GFX9X-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_sext_cvt_pk_f32_fp8_word0:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cvt_pk_f32_fp8_e32 v[0:1], v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %a.sext = sext i16 %a to i32
  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a.sext, i1 false)
  ret <2 x float> %ret
}