; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s

; Codegen checks for the four 8-bit float dot4 intrinsics
; (fp8.bf8, bf8.fp8, fp8.fp8, bf8.bf8). Both llc invocations above (the
; default selector and -global-isel) share one check prefix, so they are
; expected to emit identical code.
;
; Each intrinsic gets the same five variants, which differ only in how the
; float accumulator %c is wrapped before the call:
;   plain       - no modifier expected
;   fabs        - fabs(c)        -> neg_hi:[0,0,1]
;   fneg        - fneg(c)        -> neg_lo:[0,0,1]
;   fabs_fneg   - fabs(fneg(c))  -> neg_hi:[0,0,1] only
;   fneg_fabs   - fneg(fabs(c))  -> neg_lo:[0,0,1] neg_hi:[0,0,1]
; No separate VALU instruction is expected for the fabs/fneg in any variant.

;---------------------------------------------------------------------------
; llvm.amdgcn.dot4.f32.fp8.bf8
;---------------------------------------------------------------------------

; Unmodified accumulator.
define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
  ret float %dot
}

; fabs on the accumulator.
define float @test_amdgcn_dot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %abs.c)
  ret float %dot
}

; fneg on the accumulator.
define float @test_amdgcn_dot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %neg.c)
  ret float %dot
}

; fabs applied to fneg: the inner negation is expected to disappear.
define float @test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %abs.neg.c = call float @llvm.fabs.f32(float %neg.c)
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %abs.neg.c)
  ret float %dot
}

; fneg applied to fabs: both modifiers are expected.
define float @test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %neg.abs.c = fneg float %abs.c
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %neg.abs.c)
  ret float %dot
}

;---------------------------------------------------------------------------
; llvm.amdgcn.dot4.f32.bf8.fp8
;---------------------------------------------------------------------------

; Unmodified accumulator.
define float @test_amdgcn_dot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
  ret float %dot
}

; fabs on the accumulator.
define float @test_amdgcn_dot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %abs.c)
  ret float %dot
}

; fneg on the accumulator.
define float @test_amdgcn_dot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %neg.c)
  ret float %dot
}

; fabs applied to fneg: the inner negation is expected to disappear.
define float @test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %abs.neg.c = call float @llvm.fabs.f32(float %neg.c)
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %abs.neg.c)
  ret float %dot
}

; fneg applied to fabs: both modifiers are expected.
define float @test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %neg.abs.c = fneg float %abs.c
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %neg.abs.c)
  ret float %dot
}

;---------------------------------------------------------------------------
; llvm.amdgcn.dot4.f32.fp8.fp8
;---------------------------------------------------------------------------

; Unmodified accumulator.
define float @test_amdgcn_dot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
  ret float %dot
}

; fabs on the accumulator.
define float @test_amdgcn_dot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %abs.c)
  ret float %dot
}

; fneg on the accumulator.
define float @test_amdgcn_dot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %neg.c)
  ret float %dot
}

; fabs applied to fneg: the inner negation is expected to disappear.
define float @test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %abs.neg.c = call float @llvm.fabs.f32(float %neg.c)
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %abs.neg.c)
  ret float %dot
}

; fneg applied to fabs: both modifiers are expected.
define float @test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %neg.abs.c = fneg float %abs.c
  %dot = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %neg.abs.c)
  ret float %dot
}

;---------------------------------------------------------------------------
; llvm.amdgcn.dot4.f32.bf8.bf8
;---------------------------------------------------------------------------

; Unmodified accumulator.
define float @test_amdgcn_dot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
  ret float %dot
}

; fabs on the accumulator.
define float @test_amdgcn_dot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %abs.c)
  ret float %dot
}

; fneg on the accumulator.
define float @test_amdgcn_dot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %neg.c)
  ret float %dot
}

; fabs applied to fneg: the inner negation is expected to disappear.
define float @test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %neg.c = fneg float %c
  %abs.neg.c = call float @llvm.fabs.f32(float %neg.c)
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %abs.neg.c)
  ret float %dot
}

; fneg applied to fabs: both modifiers are expected.
define float @test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
entry:
  %abs.c = call float @llvm.fabs.f32(float %c)
  %neg.abs.c = fneg float %abs.c
  %dot = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %neg.abs.c)
  ret float %dot
}

; Intrinsics under test: two packed i32 inputs plus a float accumulator.
declare float @llvm.amdgcn.dot4.f32.fp8.bf8(i32, i32, float)
declare float @llvm.amdgcn.dot4.f32.bf8.fp8(i32, i32, float)
declare float @llvm.amdgcn.dot4.f32.fp8.fp8(i32, i32, float)
declare float @llvm.amdgcn.dot4.f32.bf8.bf8(i32, i32, float)

; Helper intrinsic used to build the fabs variants.
declare float @llvm.fabs.f32(float)
