; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s

; Codegen coverage for the two f16 MFMA intrinsics declared below, compiled for
; gfx950 once with -global-isel=0 and once with -global-isel=1. The assertion
; lines are tool-generated: regenerate them with the update script named in the
; first line of this file instead of editing them by hand.

declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)

; --------------------------------------------------------------------
; llvm.amdgcn.mfma.f32.16x16x32.f16
; --------------------------------------------------------------------

; Non-kernel function with all three immediate operands set to zero. The
; expected MFMA line carries no cbsz/abid/blgp modifiers; the accumulator is
; copied into a[0:3] before the MFMA and read back into v[0:3] afterwards.
define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
; GCN-LABEL: test_mfma_f32_16x16x32_f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; GCN-NEXT:    s_nop 6
; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
  ret <4 x float> %result
}

; Same shape as the previous test, with the three immediates set to 1, 1, 1;
; they are expected to show up as modifiers on the MFMA line.
define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; GCN-NEXT:    s_nop 6
; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
  ret <4 x float> %result
}

; Kernel variant: the operands are kernel arguments fetched with s_load and the
; result is stored to %out.
; NOTE(review): despite the "no_agpr" in the name, the expected output below
; still keeps the accumulator in a[0:3] for both selectors.
define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_mov_b32_e32 v8, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; SDAG-NEXT:    s_nop 6
; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 5
; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT:    s_endpgm
  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}

; Same kernel with distinct immediates 3, 2, 1. Together with the expected
; MFMA line this pins the operand order: first immediate -> cbsz, second ->
; abid, third -> blgp.
define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_mov_b32_e32 v8, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
; SDAG-NEXT:    s_nop 6
; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 5
; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT:    s_endpgm
  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}

; --------------------------------------------------------------------
; llvm.amdgcn.mfma.f32.32x32x16.f16
; --------------------------------------------------------------------

; Kernel that stores the MFMA result and then the original %arg2, both as
; volatile stores to a null global pointer. The expected code reads the
; accumulator from a[0:15] and writes the result to a[16:31].
define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 {
; SDAG-LABEL: test_mfma_f32_32x32x16_f16:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 48
; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 32
; SDAG-NEXT:    v_mov_b64_e32 v[16:17], 16
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
; SDAG-NEXT:    v_mov_b64_e32 v[18:19], 0
; SDAG-NEXT:    v_mov_b32_e32 v8, s16
; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT:    v_mov_b32_e32 v0, s20
; SDAG-NEXT:    v_mov_b32_e32 v1, s21
; SDAG-NEXT:    v_mov_b32_e32 v2, s22
; SDAG-NEXT:    v_mov_b32_e32 v3, s23
; SDAG-NEXT:    v_mov_b32_e32 v9, s17
; SDAG-NEXT:    v_mov_b32_e32 v10, s18
; SDAG-NEXT:    v_mov_b32_e32 v11, s19
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s8
; SDAG-NEXT:    v_mov_b32_e32 v1, s9
; SDAG-NEXT:    v_mov_b32_e32 v2, s10
; SDAG-NEXT:    v_mov_b32_e32 v3, s11
; SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], 0
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], 48
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], 16
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT:    s_nop 3
; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_endpgm
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
  store volatile <16 x float> %result, ptr addrspace(1) null
  store volatile <16 x float> %arg2, ptr addrspace(1) null
  ret void
}

; Same kernel with immediates 2, 3, 1, expected as modifiers on the MFMA line.
define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 {
; SDAG-LABEL: test_mfma_f32_32x32x16_f16__flags:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 48
; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 32
; SDAG-NEXT:    v_mov_b64_e32 v[16:17], 16
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
; SDAG-NEXT:    v_mov_b64_e32 v[18:19], 0
; SDAG-NEXT:    v_mov_b32_e32 v8, s16
; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT:    v_mov_b32_e32 v0, s20
; SDAG-NEXT:    v_mov_b32_e32 v1, s21
; SDAG-NEXT:    v_mov_b32_e32 v2, s22
; SDAG-NEXT:    v_mov_b32_e32 v3, s23
; SDAG-NEXT:    v_mov_b32_e32 v9, s17
; SDAG-NEXT:    v_mov_b32_e32 v10, s18
; SDAG-NEXT:    v_mov_b32_e32 v11, s19
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s8
; SDAG-NEXT:    v_mov_b32_e32 v1, s9
; SDAG-NEXT:    v_mov_b32_e32 v2, s10
; SDAG-NEXT:    v_mov_b32_e32 v3, s11
; SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__flags:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], 0
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], 48
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], 16
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT:    s_nop 3
; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_endpgm
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
  store volatile <16 x float> %result, ptr addrspace(1) null
  store volatile <16 x float> %arg2, ptr addrspace(1) null
  ret void
}

; Non-kernel function that only returns the MFMA result. The expected MFMA
; line uses a[0:15] as both the accumulator source and the destination.
define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 2
; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
  ret <16 x float> %result
}

; Same as the previous test with immediates 1, 1, 1.
define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 2
; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
  ret <16 x float> %result
}

; Kernel that stores the original %arg2 and then the MFMA result to %out, both
; as volatile stores. The two selectors are expected to choose different
; accumulator ranges here (a[16:31] -> a[0:15] versus a[0:15] -> a[16:31]).
define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT:    v_mov_b32_e32 v12, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; SDAG-NEXT:    v_accvgpr_write_b32 a31, s23
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; SDAG-NEXT:    v_accvgpr_write_b32 a30, s22
; SDAG-NEXT:    v_accvgpr_write_b32 a29, s21
; SDAG-NEXT:    v_accvgpr_write_b32 a28, s20
; SDAG-NEXT:    v_accvgpr_write_b32 a27, s19
; SDAG-NEXT:    v_accvgpr_write_b32 a26, s18
; SDAG-NEXT:    v_accvgpr_write_b32 a25, s17
; SDAG-NEXT:    v_accvgpr_write_b32 a24, s16
; SDAG-NEXT:    v_accvgpr_write_b32 a23, s15
; SDAG-NEXT:    v_accvgpr_write_b32 a22, s14
; SDAG-NEXT:    v_accvgpr_write_b32 a21, s13
; SDAG-NEXT:    v_accvgpr_write_b32 a20, s12
; SDAG-NEXT:    v_accvgpr_write_b32 a19, s11
; SDAG-NEXT:    v_accvgpr_write_b32 a18, s10
; SDAG-NEXT:    v_accvgpr_write_b32 a17, s9
; SDAG-NEXT:    v_accvgpr_write_b32 a16, s8
; SDAG-NEXT:    v_mov_b32_e32 v8, s20
; SDAG-NEXT:    v_mov_b32_e32 v9, s21
; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
; SDAG-NEXT:    v_mov_b32_e32 v10, s22
; SDAG-NEXT:    v_mov_b32_e32 v11, s23
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s8
; SDAG-NEXT:    v_mov_b32_e32 v1, s9
; SDAG-NEXT:    v_mov_b32_e32 v2, s10
; SDAG-NEXT:    v_mov_b32_e32 v3, s11
; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT:    v_mov_b32_e32 v24, 0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_endpgm
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
  store volatile <16 x float> %arg2, ptr addrspace(1) %out
  store volatile <16 x float> %result, ptr addrspace(1) %out
  ret void
}

; Same kernel with immediates 1, 2, 3, expected as modifiers on the MFMA line.
define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT:    v_mov_b32_e32 v12, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; SDAG-NEXT:    v_accvgpr_write_b32 a31, s23
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; SDAG-NEXT:    v_accvgpr_write_b32 a30, s22
; SDAG-NEXT:    v_accvgpr_write_b32 a29, s21
; SDAG-NEXT:    v_accvgpr_write_b32 a28, s20
; SDAG-NEXT:    v_accvgpr_write_b32 a27, s19
; SDAG-NEXT:    v_accvgpr_write_b32 a26, s18
; SDAG-NEXT:    v_accvgpr_write_b32 a25, s17
; SDAG-NEXT:    v_accvgpr_write_b32 a24, s16
; SDAG-NEXT:    v_accvgpr_write_b32 a23, s15
; SDAG-NEXT:    v_accvgpr_write_b32 a22, s14
; SDAG-NEXT:    v_accvgpr_write_b32 a21, s13
; SDAG-NEXT:    v_accvgpr_write_b32 a20, s12
; SDAG-NEXT:    v_accvgpr_write_b32 a19, s11
; SDAG-NEXT:    v_accvgpr_write_b32 a18, s10
; SDAG-NEXT:    v_accvgpr_write_b32 a17, s9
; SDAG-NEXT:    v_accvgpr_write_b32 a16, s8
; SDAG-NEXT:    v_mov_b32_e32 v8, s20
; SDAG-NEXT:    v_mov_b32_e32 v9, s21
; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
; SDAG-NEXT:    v_mov_b32_e32 v10, s22
; SDAG-NEXT:    v_mov_b32_e32 v11, s23
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_mov_b32_e32 v0, s8
; SDAG-NEXT:    v_mov_b32_e32 v1, s9
; SDAG-NEXT:    v_mov_b32_e32 v2, s10
; SDAG-NEXT:    v_mov_b32_e32 v3, s11
; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT:    v_mov_b32_e32 v24, 0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_endpgm
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
  store volatile <16 x float> %arg2, ptr addrspace(1) %out
  store volatile <16 x float> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
; SDAG-NEXT:    v_accvgpr_write_b32 a4, s12
; SDAG-NEXT:    v_accvgpr_write_b32 a5, s13
; SDAG-NEXT:    v_accvgpr_write_b32 a6, s14
; SDAG-NEXT:    v_accvgpr_write_b32 a7, s15
; SDAG-NEXT:    v_accvgpr_write_b32 a8, s16
; SDAG-NEXT:    v_accvgpr_write_b32 a9, s17
; SDAG-NEXT:    v_accvgpr_write_b32 a10, s18
; SDAG-NEXT:    v_accvgpr_write_b32 a11, s19
; SDAG-NEXT:    v_accvgpr_write_b32 a12, s20
; SDAG-NEXT:    v_accvgpr_write_b32 a13, s21
; SDAG-NEXT:    v_accvgpr_write_b32 a14, s22
; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT:    v_mov_b32_e32 v0, 0
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT:    v_accvgpr_write_b32 a0, s8
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT:    v_accvgpr_write_b32 a1, s9
; GISEL-NEXT:    v_accvgpr_write_b32 a2, s10
; GISEL-NEXT:    v_accvgpr_write_b32 a3, s11
; GISEL-NEXT:    v_accvgpr_write_b32 a4, s12
; GISEL-NEXT:    v_accvgpr_write_b32 a5, s13
; GISEL-NEXT:    v_accvgpr_write_b32 a6, s14
; GISEL-NEXT:    v_accvgpr_write_b32 a7, s15
; GISEL-NEXT:    v_accvgpr_write_b32 a8, s16
; GISEL-NEXT:    v_accvgpr_write_b32 a9, s17
; GISEL-NEXT:    v_accvgpr_write_b32 a10, s18
; GISEL-NEXT:    v_accvgpr_write_b32 a11, s19
; GISEL-NEXT:    v_accvgpr_write_b32 a12, s20
; GISEL-NEXT:    v_accvgpr_write_b32 a13, s21
; GISEL-NEXT:    v_accvgpr_write_b32 a14, s22
; GISEL-NEXT:    v_accvgpr_write_b32 a15, s23
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT: 
v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] 814; GISEL-NEXT: v_mov_b32_e32 v0, 0 815; GISEL-NEXT: s_nop 7 816; GISEL-NEXT: s_nop 1 817; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 818; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 819; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 820; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 821; GISEL-NEXT: s_endpgm 822 %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) 823 store <16 x float> %result, ptr addrspace(1) %out 824 ret void 825} 826 827define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 { 828; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: 829; SDAG: ; %bb.0: 830; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 831; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 832; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 833; SDAG-NEXT: s_waitcnt lgkmcnt(0) 834; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 835; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 836; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] 837; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 838; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 839; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 840; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 841; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 842; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 843; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 844; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 845; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 846; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 847; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 848; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 849; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 850; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 851; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 852; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 853; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 854; 
SDAG-NEXT: s_nop 1 855; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 856; SDAG-NEXT: v_mov_b32_e32 v0, 0 857; SDAG-NEXT: s_nop 7 858; SDAG-NEXT: s_nop 1 859; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 860; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 861; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 862; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 863; SDAG-NEXT: s_endpgm 864; 865; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: 866; GISEL: ; %bb.0: 867; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 868; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 869; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 870; GISEL-NEXT: s_waitcnt lgkmcnt(0) 871; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 872; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 873; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] 874; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 875; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 876; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 877; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 878; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 879; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 880; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 881; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 882; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 883; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 884; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 885; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 886; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 887; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 888; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 889; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 890; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 891; GISEL-NEXT: s_nop 1 892; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 893; GISEL-NEXT: v_mov_b32_e32 v0, 0 894; GISEL-NEXT: s_nop 7 895; GISEL-NEXT: s_nop 1 896; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 897; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], 
s[0:1] offset:16 898; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 899; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 900; GISEL-NEXT: s_endpgm 901 %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) 902 store <16 x float> %result, ptr addrspace(1) %out 903 ret void 904} 905 906; -------------------------------------------------------------------- 907; llvm.amdgcn.mfma.i32.16x16x64.i8 908; -------------------------------------------------------------------- 909 910declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) 911 912define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { 913; GCN-LABEL: test_mfma_i32_16x16x64_i8: 914; GCN: ; %bb.0: 915; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 916; GCN-NEXT: v_accvgpr_write_b32 a0, v8 917; GCN-NEXT: v_accvgpr_write_b32 a1, v9 918; GCN-NEXT: v_accvgpr_write_b32 a2, v10 919; GCN-NEXT: v_accvgpr_write_b32 a3, v11 920; GCN-NEXT: s_nop 1 921; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] 922; GCN-NEXT: s_nop 6 923; GCN-NEXT: v_accvgpr_read_b32 v0, a0 924; GCN-NEXT: v_accvgpr_read_b32 v1, a1 925; GCN-NEXT: v_accvgpr_read_b32 v2, a2 926; GCN-NEXT: v_accvgpr_read_b32 v3, a3 927; GCN-NEXT: s_setpc_b64 s[30:31] 928 %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) 929 ret <4 x i32> %result 930} 931 932define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) { 933; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags: 934; GCN: ; %bb.0: 935; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 936; GCN-NEXT: v_accvgpr_write_b32 a0, v8 937; GCN-NEXT: v_accvgpr_write_b32 a1, v9 938; GCN-NEXT: v_accvgpr_write_b32 a2, v10 939; GCN-NEXT: v_accvgpr_write_b32 a3, v11 940; 
GCN-NEXT: s_nop 1 941; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 942; GCN-NEXT: s_nop 6 943; GCN-NEXT: v_accvgpr_read_b32 v0, a0 944; GCN-NEXT: v_accvgpr_read_b32 v1, a1 945; GCN-NEXT: v_accvgpr_read_b32 v2, a2 946; GCN-NEXT: v_accvgpr_read_b32 v3, a3 947; GCN-NEXT: s_setpc_b64 s[30:31] 948 %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 1, i32 1, i32 1) 949 ret <4 x i32> %result 950} 951 952define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspace(1) %out, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) #0 { 953; SDAG-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: 954; SDAG: ; %bb.0: 955; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 956; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 957; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 958; SDAG-NEXT: v_mov_b32_e32 v8, 0 959; SDAG-NEXT: s_waitcnt lgkmcnt(0) 960; SDAG-NEXT: v_mov_b32_e32 v0, s8 961; SDAG-NEXT: v_mov_b32_e32 v1, s9 962; SDAG-NEXT: v_mov_b32_e32 v2, s10 963; SDAG-NEXT: v_mov_b32_e32 v3, s11 964; SDAG-NEXT: v_mov_b32_e32 v4, s12 965; SDAG-NEXT: v_mov_b32_e32 v5, s13 966; SDAG-NEXT: v_mov_b32_e32 v6, s14 967; SDAG-NEXT: v_mov_b32_e32 v7, s15 968; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 969; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 970; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 971; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 972; SDAG-NEXT: s_nop 1 973; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] 974; SDAG-NEXT: s_nop 6 975; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] 976; SDAG-NEXT: s_endpgm 977; 978; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: 979; GISEL: ; %bb.0: 980; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 981; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 982; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 983; GISEL-NEXT: s_waitcnt lgkmcnt(0) 984; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] 985; GISEL-NEXT: v_mov_b64_e32 
v[2:3], s[10:11] 986; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] 987; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 988; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] 989; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 990; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 991; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 992; GISEL-NEXT: s_nop 1 993; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] 994; GISEL-NEXT: v_mov_b32_e32 v0, 0 995; GISEL-NEXT: s_nop 5 996; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] 997; GISEL-NEXT: s_endpgm 998 %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) 999 store <4 x i32> %result, ptr addrspace(1) %out 1000 ret void 1001} 1002 1003define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) #0 { 1004; SDAG-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: 1005; SDAG: ; %bb.0: 1006; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 1007; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1008; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 1009; SDAG-NEXT: v_mov_b32_e32 v8, 0 1010; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1011; SDAG-NEXT: v_mov_b32_e32 v0, s8 1012; SDAG-NEXT: v_mov_b32_e32 v1, s9 1013; SDAG-NEXT: v_mov_b32_e32 v2, s10 1014; SDAG-NEXT: v_mov_b32_e32 v3, s11 1015; SDAG-NEXT: v_mov_b32_e32 v4, s12 1016; SDAG-NEXT: v_mov_b32_e32 v5, s13 1017; SDAG-NEXT: v_mov_b32_e32 v6, s14 1018; SDAG-NEXT: v_mov_b32_e32 v7, s15 1019; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 1020; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 1021; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 1022; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 1023; SDAG-NEXT: s_nop 1 1024; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 1025; SDAG-NEXT: s_nop 6 1026; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] 1027; SDAG-NEXT: s_endpgm 1028; 1029; GISEL-LABEL: 
test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: 1030; GISEL: ; %bb.0: 1031; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 1032; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 1033; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1034; GISEL-NEXT: s_waitcnt lgkmcnt(0) 1035; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] 1036; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] 1037; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] 1038; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 1039; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] 1040; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 1041; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 1042; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 1043; GISEL-NEXT: s_nop 1 1044; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 1045; GISEL-NEXT: v_mov_b32_e32 v0, 0 1046; GISEL-NEXT: s_nop 5 1047; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] 1048; GISEL-NEXT: s_endpgm 1049 %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1) 1050 store <4 x i32> %result, ptr addrspace(1) %out 1051 ret void 1052} 1053 1054; -------------------------------------------------------------------- 1055; llvm.amdgcn.mfma.i32.32x32x32.i8 1056; -------------------------------------------------------------------- 1057 1058declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32>, <4 x i32>, <16 x i32>, i32 immarg, i32 immarg, i32 immarg) 1059 1060define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 { 1061; SDAG-LABEL: test_mfma_i32_32x32x32_i8: 1062; SDAG: ; %bb.0: 1063; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1064; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1065; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 1066; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 1067; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 1068; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1069; SDAG-NEXT: v_mov_b32_e32 v0, s24 1070; SDAG-NEXT: v_mov_b32_e32 v1, s25 1071; 
SDAG-NEXT: v_mov_b32_e32 v2, s26 1072; SDAG-NEXT: v_mov_b32_e32 v3, s27 1073; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 1074; SDAG-NEXT: v_mov_b32_e32 v4, s28 1075; SDAG-NEXT: v_mov_b32_e32 v5, s29 1076; SDAG-NEXT: v_mov_b32_e32 v6, s30 1077; SDAG-NEXT: v_mov_b32_e32 v7, s31 1078; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 1079; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 1080; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 1081; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 1082; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 1083; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 1084; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 1085; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 1086; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 1087; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 1088; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 1089; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 1090; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 1091; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 1092; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 1093; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 1094; SDAG-NEXT: s_nop 0 1095; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] 1096; SDAG-NEXT: v_mov_b32_e32 v0, s16 1097; SDAG-NEXT: v_mov_b32_e32 v1, s17 1098; SDAG-NEXT: v_mov_b32_e32 v2, s18 1099; SDAG-NEXT: v_mov_b32_e32 v3, s19 1100; SDAG-NEXT: s_nop 6 1101; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 1102; SDAG-NEXT: s_waitcnt vmcnt(0) 1103; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 1104; SDAG-NEXT: s_waitcnt vmcnt(0) 1105; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 1106; SDAG-NEXT: s_waitcnt vmcnt(0) 1107; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 1108; SDAG-NEXT: s_waitcnt vmcnt(0) 1109; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 1110; SDAG-NEXT: s_waitcnt vmcnt(0) 1111; SDAG-NEXT: s_nop 0 1112; SDAG-NEXT: v_mov_b32_e32 v0, s20 1113; SDAG-NEXT: v_mov_b32_e32 v1, s21 1114; SDAG-NEXT: v_mov_b32_e32 v2, s22 1115; SDAG-NEXT: v_mov_b32_e32 v3, s23 1116; SDAG-NEXT: 
global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 1117; SDAG-NEXT: s_waitcnt vmcnt(0) 1118; SDAG-NEXT: s_nop 0 1119; SDAG-NEXT: v_mov_b32_e32 v0, s8 1120; SDAG-NEXT: v_mov_b32_e32 v1, s9 1121; SDAG-NEXT: v_mov_b32_e32 v2, s10 1122; SDAG-NEXT: v_mov_b32_e32 v3, s11 1123; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 1124; SDAG-NEXT: s_waitcnt vmcnt(0) 1125; SDAG-NEXT: s_nop 0 1126; SDAG-NEXT: v_mov_b32_e32 v0, s12 1127; SDAG-NEXT: v_mov_b32_e32 v1, s13 1128; SDAG-NEXT: v_mov_b32_e32 v2, s14 1129; SDAG-NEXT: v_mov_b32_e32 v3, s15 1130; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 1131; SDAG-NEXT: s_waitcnt vmcnt(0) 1132; SDAG-NEXT: s_endpgm 1133; 1134; GISEL-LABEL: test_mfma_i32_32x32x32_i8: 1135; GISEL: ; %bb.0: 1136; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1137; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1138; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0 1139; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48 1140; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16 1141; GISEL-NEXT: s_waitcnt lgkmcnt(0) 1142; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 1143; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 1144; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] 1145; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 1146; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 1147; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 1148; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 1149; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 1150; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 1151; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 1152; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 1153; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 1154; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 1155; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 1156; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 1157; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 1158; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 1159; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 1160; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 1161; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 1162; GISEL-NEXT: 
v_mov_b64_e32 v[8:9], s[8:9] 1163; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] 1164; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] 1165; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] 1166; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] 1167; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] 1168; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32 1169; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] 1170; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] 1171; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] 1172; GISEL-NEXT: s_nop 3 1173; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 1174; GISEL-NEXT: s_waitcnt vmcnt(0) 1175; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 1176; GISEL-NEXT: s_waitcnt vmcnt(0) 1177; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1 1178; GISEL-NEXT: s_waitcnt vmcnt(0) 1179; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1 1180; GISEL-NEXT: s_waitcnt vmcnt(0) 1181; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 1182; GISEL-NEXT: s_waitcnt vmcnt(0) 1183; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 1184; GISEL-NEXT: s_waitcnt vmcnt(0) 1185; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 1186; GISEL-NEXT: s_waitcnt vmcnt(0) 1187; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 1188; GISEL-NEXT: s_waitcnt vmcnt(0) 1189; GISEL-NEXT: s_endpgm 1190 %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) 1191 store volatile <16 x i32> %result, ptr addrspace(1) null 1192 store volatile <16 x i32> %arg2, ptr addrspace(1) null 1193 ret void 1194} 1195 1196define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 { 1197; SDAG-LABEL: test_mfma_i32_32x32x32_i8__flags: 1198; SDAG: ; %bb.0: 1199; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1200; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 
0x64 1201; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 1202; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 1203; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 1204; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1205; SDAG-NEXT: v_mov_b32_e32 v0, s24 1206; SDAG-NEXT: v_mov_b32_e32 v1, s25 1207; SDAG-NEXT: v_mov_b32_e32 v2, s26 1208; SDAG-NEXT: v_mov_b32_e32 v3, s27 1209; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 1210; SDAG-NEXT: v_mov_b32_e32 v4, s28 1211; SDAG-NEXT: v_mov_b32_e32 v5, s29 1212; SDAG-NEXT: v_mov_b32_e32 v6, s30 1213; SDAG-NEXT: v_mov_b32_e32 v7, s31 1214; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 1215; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 1216; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 1217; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 1218; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 1219; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 1220; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 1221; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 1222; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 1223; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 1224; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 1225; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 1226; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 1227; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 1228; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 1229; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 1230; SDAG-NEXT: s_nop 0 1231; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 1232; SDAG-NEXT: v_mov_b32_e32 v0, s16 1233; SDAG-NEXT: v_mov_b32_e32 v1, s17 1234; SDAG-NEXT: v_mov_b32_e32 v2, s18 1235; SDAG-NEXT: v_mov_b32_e32 v3, s19 1236; SDAG-NEXT: s_nop 6 1237; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 1238; SDAG-NEXT: s_waitcnt vmcnt(0) 1239; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 1240; SDAG-NEXT: s_waitcnt vmcnt(0) 1241; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 1242; SDAG-NEXT: s_waitcnt vmcnt(0) 1243; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 1244; SDAG-NEXT: s_waitcnt vmcnt(0) 1245; SDAG-NEXT: 
global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 1246; SDAG-NEXT: s_waitcnt vmcnt(0) 1247; SDAG-NEXT: s_nop 0 1248; SDAG-NEXT: v_mov_b32_e32 v0, s20 1249; SDAG-NEXT: v_mov_b32_e32 v1, s21 1250; SDAG-NEXT: v_mov_b32_e32 v2, s22 1251; SDAG-NEXT: v_mov_b32_e32 v3, s23 1252; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 1253; SDAG-NEXT: s_waitcnt vmcnt(0) 1254; SDAG-NEXT: s_nop 0 1255; SDAG-NEXT: v_mov_b32_e32 v0, s8 1256; SDAG-NEXT: v_mov_b32_e32 v1, s9 1257; SDAG-NEXT: v_mov_b32_e32 v2, s10 1258; SDAG-NEXT: v_mov_b32_e32 v3, s11 1259; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 1260; SDAG-NEXT: s_waitcnt vmcnt(0) 1261; SDAG-NEXT: s_nop 0 1262; SDAG-NEXT: v_mov_b32_e32 v0, s12 1263; SDAG-NEXT: v_mov_b32_e32 v1, s13 1264; SDAG-NEXT: v_mov_b32_e32 v2, s14 1265; SDAG-NEXT: v_mov_b32_e32 v3, s15 1266; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 1267; SDAG-NEXT: s_waitcnt vmcnt(0) 1268; SDAG-NEXT: s_endpgm 1269; 1270; GISEL-LABEL: test_mfma_i32_32x32x32_i8__flags: 1271; GISEL: ; %bb.0: 1272; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1273; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1274; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0 1275; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48 1276; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16 1277; GISEL-NEXT: s_waitcnt lgkmcnt(0) 1278; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 1279; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 1280; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] 1281; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 1282; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 1283; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 1284; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 1285; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 1286; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 1287; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 1288; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 1289; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 1290; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 1291; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 1292; 
GISEL-NEXT: v_accvgpr_write_b32 a10, s18 1293; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 1294; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 1295; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 1296; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 1297; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 1298; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] 1299; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] 1300; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 1301; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] 1302; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] 1303; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] 1304; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32 1305; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] 1306; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] 1307; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] 1308; GISEL-NEXT: s_nop 3 1309; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 1310; GISEL-NEXT: s_waitcnt vmcnt(0) 1311; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 1312; GISEL-NEXT: s_waitcnt vmcnt(0) 1313; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1 1314; GISEL-NEXT: s_waitcnt vmcnt(0) 1315; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1 1316; GISEL-NEXT: s_waitcnt vmcnt(0) 1317; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 1318; GISEL-NEXT: s_waitcnt vmcnt(0) 1319; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 1320; GISEL-NEXT: s_waitcnt vmcnt(0) 1321; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 1322; GISEL-NEXT: s_waitcnt vmcnt(0) 1323; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 1324; GISEL-NEXT: s_waitcnt vmcnt(0) 1325; GISEL-NEXT: s_endpgm 1326 %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 2, i32 3, i32 1) 1327 store volatile <16 x i32> %result, ptr addrspace(1) null 1328 store volatile <16 x i32> %arg2, ptr addrspace(1) null 1329 ret 
void 1330} 1331 1332define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { 1333; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac: 1334; GCN: ; %bb.0: 1335; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1336; GCN-NEXT: v_accvgpr_write_b32 a0, v8 1337; GCN-NEXT: v_accvgpr_write_b32 a1, v9 1338; GCN-NEXT: v_accvgpr_write_b32 a2, v10 1339; GCN-NEXT: v_accvgpr_write_b32 a3, v11 1340; GCN-NEXT: v_accvgpr_write_b32 a4, v12 1341; GCN-NEXT: v_accvgpr_write_b32 a5, v13 1342; GCN-NEXT: v_accvgpr_write_b32 a6, v14 1343; GCN-NEXT: v_accvgpr_write_b32 a7, v15 1344; GCN-NEXT: v_accvgpr_write_b32 a8, v16 1345; GCN-NEXT: v_accvgpr_write_b32 a9, v17 1346; GCN-NEXT: v_accvgpr_write_b32 a10, v18 1347; GCN-NEXT: v_accvgpr_write_b32 a11, v19 1348; GCN-NEXT: v_accvgpr_write_b32 a12, v20 1349; GCN-NEXT: v_accvgpr_write_b32 a13, v21 1350; GCN-NEXT: v_accvgpr_write_b32 a14, v22 1351; GCN-NEXT: v_accvgpr_write_b32 a15, v23 1352; GCN-NEXT: s_nop 1 1353; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] 1354; GCN-NEXT: s_nop 7 1355; GCN-NEXT: s_nop 2 1356; GCN-NEXT: v_accvgpr_read_b32 v0, a0 1357; GCN-NEXT: v_accvgpr_read_b32 v1, a1 1358; GCN-NEXT: v_accvgpr_read_b32 v2, a2 1359; GCN-NEXT: v_accvgpr_read_b32 v3, a3 1360; GCN-NEXT: v_accvgpr_read_b32 v4, a4 1361; GCN-NEXT: v_accvgpr_read_b32 v5, a5 1362; GCN-NEXT: v_accvgpr_read_b32 v6, a6 1363; GCN-NEXT: v_accvgpr_read_b32 v7, a7 1364; GCN-NEXT: v_accvgpr_read_b32 v8, a8 1365; GCN-NEXT: v_accvgpr_read_b32 v9, a9 1366; GCN-NEXT: v_accvgpr_read_b32 v10, a10 1367; GCN-NEXT: v_accvgpr_read_b32 v11, a11 1368; GCN-NEXT: v_accvgpr_read_b32 v12, a12 1369; GCN-NEXT: v_accvgpr_read_b32 v13, a13 1370; GCN-NEXT: v_accvgpr_read_b32 v14, a14 1371; GCN-NEXT: v_accvgpr_read_b32 v15, a15 1372; GCN-NEXT: s_setpc_b64 s[30:31] 1373 %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) 1374 ret <16 x i32> %result 1375} 1376 
; Flags variant of test_mfma_i32_32x32x32_i8__mac: calls
; llvm.amdgcn.mfma.i32.32x32x32.i8 with all three trailing immediates set to 1.
; The expected code copies v8-v23 into a0-a15, issues a single
; v_mfma_i32_32x32x32_i8 that uses a[0:15] as both accumulator input and
; destination and carries the modifiers cbsz:1 abid:1 blgp:1, then reads
; a0-a15 back into v0-v15 before returning.
; The same expectations are used for both RUN lines (shared GCN check prefix).
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
1377define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) { 1378; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: 1379; GCN: ; %bb.0: 1380; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1381; GCN-NEXT: v_accvgpr_write_b32 a0, v8 1382; GCN-NEXT: v_accvgpr_write_b32 a1, v9 1383; GCN-NEXT: v_accvgpr_write_b32 a2, v10 1384; GCN-NEXT: v_accvgpr_write_b32 a3, v11 1385; GCN-NEXT: v_accvgpr_write_b32 a4, v12 1386; GCN-NEXT: v_accvgpr_write_b32 a5, v13 1387; GCN-NEXT: v_accvgpr_write_b32 a6, v14 1388; GCN-NEXT: v_accvgpr_write_b32 a7, v15 1389; GCN-NEXT: v_accvgpr_write_b32 a8, v16 1390; GCN-NEXT: v_accvgpr_write_b32 a9, v17 1391; GCN-NEXT: v_accvgpr_write_b32 a10, v18 1392; GCN-NEXT: v_accvgpr_write_b32 a11, v19 1393; GCN-NEXT: v_accvgpr_write_b32 a12, v20 1394; GCN-NEXT: v_accvgpr_write_b32 a13, v21 1395; GCN-NEXT: v_accvgpr_write_b32 a14, v22 1396; GCN-NEXT: v_accvgpr_write_b32 a15, v23 1397; GCN-NEXT: s_nop 1 1398; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 1399; GCN-NEXT: s_nop 7 1400; GCN-NEXT: s_nop 2 1401; GCN-NEXT: v_accvgpr_read_b32 v0, a0 1402; GCN-NEXT: v_accvgpr_read_b32 v1, a1 1403; GCN-NEXT: v_accvgpr_read_b32 v2, a2 1404; GCN-NEXT: v_accvgpr_read_b32 v3, a3 1405; GCN-NEXT: v_accvgpr_read_b32 v4, a4 1406; GCN-NEXT: v_accvgpr_read_b32 v5, a5 1407; GCN-NEXT: v_accvgpr_read_b32 v6, a6 1408; GCN-NEXT: v_accvgpr_read_b32 v7, a7 1409; GCN-NEXT: v_accvgpr_read_b32 v8, a8 1410; GCN-NEXT: v_accvgpr_read_b32 v9, a9 1411; GCN-NEXT: v_accvgpr_read_b32 v10, a10 1412; GCN-NEXT: v_accvgpr_read_b32 v11, a11 1413; GCN-NEXT: v_accvgpr_read_b32 v12, a12 1414; GCN-NEXT: v_accvgpr_read_b32 v13, a13 1415; GCN-NEXT: v_accvgpr_read_b32 v14, a14 1416; GCN-NEXT: v_accvgpr_read_b32 v15, a15 1417; GCN-NEXT: s_setpc_b64 s[30:31] 1418 %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 1, i32 1) 1419 ret <16 x i32> 
%result 1420} 1421 1422define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { 1423; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: 1424; SDAG: ; %bb.0: 1425; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 1426; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1427; SDAG-NEXT: v_mov_b32_e32 v8, 0 1428; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1429; SDAG-NEXT: v_mov_b32_e32 v0, s20 1430; SDAG-NEXT: v_mov_b32_e32 v1, s21 1431; SDAG-NEXT: v_mov_b32_e32 v2, s22 1432; SDAG-NEXT: v_mov_b32_e32 v3, s23 1433; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1434; SDAG-NEXT: v_mov_b32_e32 v4, s24 1435; SDAG-NEXT: v_mov_b32_e32 v5, s25 1436; SDAG-NEXT: v_mov_b32_e32 v6, s26 1437; SDAG-NEXT: v_mov_b32_e32 v7, s27 1438; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1439; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 1440; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 1441; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 1442; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 1443; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 1444; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 1445; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 1446; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 1447; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 1448; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 1449; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 1450; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 1451; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 1452; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 1453; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 1454; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 1455; SDAG-NEXT: s_nop 1 1456; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] 1457; SDAG-NEXT: v_mov_b32_e32 v0, s20 1458; SDAG-NEXT: v_mov_b32_e32 v1, s21 1459; SDAG-NEXT: v_mov_b32_e32 v2, s22 1460; SDAG-NEXT: v_mov_b32_e32 v3, s23 1461; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 1462; SDAG-NEXT: s_waitcnt vmcnt(0) 1463; SDAG-NEXT: s_nop 0 1464; SDAG-NEXT: v_mov_b32_e32 v0, s16 1465; 
SDAG-NEXT: v_mov_b32_e32 v1, s17 1466; SDAG-NEXT: v_mov_b32_e32 v2, s18 1467; SDAG-NEXT: v_mov_b32_e32 v3, s19 1468; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 1469; SDAG-NEXT: s_waitcnt vmcnt(0) 1470; SDAG-NEXT: s_nop 0 1471; SDAG-NEXT: v_mov_b32_e32 v0, s12 1472; SDAG-NEXT: v_mov_b32_e32 v1, s13 1473; SDAG-NEXT: v_mov_b32_e32 v2, s14 1474; SDAG-NEXT: v_mov_b32_e32 v3, s15 1475; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 1476; SDAG-NEXT: s_waitcnt vmcnt(0) 1477; SDAG-NEXT: s_nop 0 1478; SDAG-NEXT: v_mov_b32_e32 v0, s8 1479; SDAG-NEXT: v_mov_b32_e32 v1, s9 1480; SDAG-NEXT: v_mov_b32_e32 v2, s10 1481; SDAG-NEXT: v_mov_b32_e32 v3, s11 1482; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 1483; SDAG-NEXT: s_waitcnt vmcnt(0) 1484; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 1485; SDAG-NEXT: s_waitcnt vmcnt(0) 1486; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 1487; SDAG-NEXT: s_waitcnt vmcnt(0) 1488; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 1489; SDAG-NEXT: s_waitcnt vmcnt(0) 1490; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 1491; SDAG-NEXT: s_waitcnt vmcnt(0) 1492; SDAG-NEXT: s_endpgm 1493; 1494; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: 1495; GISEL: ; %bb.0: 1496; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1497; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1498; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1499; GISEL-NEXT: v_mov_b32_e32 v24, 0 1500; GISEL-NEXT: s_waitcnt lgkmcnt(0) 1501; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 1502; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 1503; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] 1504; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 1505; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 1506; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 1507; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 1508; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 1509; GISEL-NEXT: v_accvgpr_write_b32 
a4, s12 1510; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 1511; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 1512; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 1513; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 1514; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 1515; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 1516; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 1517; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 1518; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 1519; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 1520; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 1521; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] 1522; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] 1523; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] 1524; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] 1525; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] 1526; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] 1527; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] 1528; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] 1529; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] 1530; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 1531; GISEL-NEXT: s_waitcnt vmcnt(0) 1532; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 1533; GISEL-NEXT: s_waitcnt vmcnt(0) 1534; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 1535; GISEL-NEXT: s_waitcnt vmcnt(0) 1536; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 1537; GISEL-NEXT: s_waitcnt vmcnt(0) 1538; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 1539; GISEL-NEXT: s_waitcnt vmcnt(0) 1540; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 1541; GISEL-NEXT: s_waitcnt vmcnt(0) 1542; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 1543; GISEL-NEXT: s_waitcnt vmcnt(0) 1544; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 1545; GISEL-NEXT: s_waitcnt vmcnt(0) 1546; GISEL-NEXT: s_endpgm 1547 %result = call <16 x i32> 
@llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) 1548 store volatile <16 x i32> %arg2, ptr addrspace(1) %out 1549 store volatile <16 x i32> %result, ptr addrspace(1) %out 1550 ret void 1551} 1552 1553define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { 1554; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: 1555; SDAG: ; %bb.0: 1556; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 1557; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1558; SDAG-NEXT: v_mov_b32_e32 v8, 0 1559; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1560; SDAG-NEXT: v_mov_b32_e32 v0, s20 1561; SDAG-NEXT: v_mov_b32_e32 v1, s21 1562; SDAG-NEXT: v_mov_b32_e32 v2, s22 1563; SDAG-NEXT: v_mov_b32_e32 v3, s23 1564; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1565; SDAG-NEXT: v_mov_b32_e32 v4, s24 1566; SDAG-NEXT: v_mov_b32_e32 v5, s25 1567; SDAG-NEXT: v_mov_b32_e32 v6, s26 1568; SDAG-NEXT: v_mov_b32_e32 v7, s27 1569; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1570; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 1571; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 1572; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 1573; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 1574; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 1575; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 1576; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 1577; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 1578; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 1579; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 1580; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 1581; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 1582; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 1583; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 1584; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 1585; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 1586; SDAG-NEXT: s_nop 1 1587; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 1588; SDAG-NEXT: v_mov_b32_e32 v0, s20 1589; SDAG-NEXT: v_mov_b32_e32 
v1, s21 1590; SDAG-NEXT: v_mov_b32_e32 v2, s22 1591; SDAG-NEXT: v_mov_b32_e32 v3, s23 1592; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 1593; SDAG-NEXT: s_waitcnt vmcnt(0) 1594; SDAG-NEXT: s_nop 0 1595; SDAG-NEXT: v_mov_b32_e32 v0, s16 1596; SDAG-NEXT: v_mov_b32_e32 v1, s17 1597; SDAG-NEXT: v_mov_b32_e32 v2, s18 1598; SDAG-NEXT: v_mov_b32_e32 v3, s19 1599; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 1600; SDAG-NEXT: s_waitcnt vmcnt(0) 1601; SDAG-NEXT: s_nop 0 1602; SDAG-NEXT: v_mov_b32_e32 v0, s12 1603; SDAG-NEXT: v_mov_b32_e32 v1, s13 1604; SDAG-NEXT: v_mov_b32_e32 v2, s14 1605; SDAG-NEXT: v_mov_b32_e32 v3, s15 1606; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 1607; SDAG-NEXT: s_waitcnt vmcnt(0) 1608; SDAG-NEXT: s_nop 0 1609; SDAG-NEXT: v_mov_b32_e32 v0, s8 1610; SDAG-NEXT: v_mov_b32_e32 v1, s9 1611; SDAG-NEXT: v_mov_b32_e32 v2, s10 1612; SDAG-NEXT: v_mov_b32_e32 v3, s11 1613; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 1614; SDAG-NEXT: s_waitcnt vmcnt(0) 1615; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 1616; SDAG-NEXT: s_waitcnt vmcnt(0) 1617; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 1618; SDAG-NEXT: s_waitcnt vmcnt(0) 1619; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 1620; SDAG-NEXT: s_waitcnt vmcnt(0) 1621; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 1622; SDAG-NEXT: s_waitcnt vmcnt(0) 1623; SDAG-NEXT: s_endpgm 1624; 1625; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: 1626; GISEL: ; %bb.0: 1627; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1628; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1629; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1630; GISEL-NEXT: v_mov_b32_e32 v24, 0 1631; GISEL-NEXT: s_waitcnt lgkmcnt(0) 1632; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 1633; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 1634; GISEL-NEXT: v_mov_b64_e32 v[4:5], 
s[28:29] 1635; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 1636; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 1637; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 1638; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 1639; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 1640; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 1641; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 1642; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 1643; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 1644; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 1645; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 1646; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 1647; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 1648; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 1649; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 1650; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 1651; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 1652; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] 1653; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] 1654; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 1655; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] 1656; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] 1657; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] 1658; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] 1659; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] 1660; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] 1661; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 1662; GISEL-NEXT: s_waitcnt vmcnt(0) 1663; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 1664; GISEL-NEXT: s_waitcnt vmcnt(0) 1665; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 1666; GISEL-NEXT: s_waitcnt vmcnt(0) 1667; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 1668; GISEL-NEXT: s_waitcnt vmcnt(0) 1669; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 1670; GISEL-NEXT: s_waitcnt vmcnt(0) 1671; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 1672; GISEL-NEXT: s_waitcnt vmcnt(0) 1673; GISEL-NEXT: 
global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 1674; GISEL-NEXT: s_waitcnt vmcnt(0) 1675; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 1676; GISEL-NEXT: s_waitcnt vmcnt(0) 1677; GISEL-NEXT: s_endpgm 1678 %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 2, i32 3) 1679 store volatile <16 x i32> %arg2, ptr addrspace(1) %out 1680 store volatile <16 x i32> %result, ptr addrspace(1) %out 1681 ret void 1682} 1683 1684define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { 1685; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: 1686; SDAG: ; %bb.0: 1687; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 1688; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1689; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1690; SDAG-NEXT: v_mov_b32_e32 v0, s20 1691; SDAG-NEXT: v_mov_b32_e32 v1, s21 1692; SDAG-NEXT: v_mov_b32_e32 v2, s22 1693; SDAG-NEXT: v_mov_b32_e32 v3, s23 1694; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1695; SDAG-NEXT: v_mov_b32_e32 v4, s24 1696; SDAG-NEXT: v_mov_b32_e32 v5, s25 1697; SDAG-NEXT: v_mov_b32_e32 v6, s26 1698; SDAG-NEXT: v_mov_b32_e32 v7, s27 1699; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1700; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 1701; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 1702; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 1703; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 1704; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 1705; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 1706; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 1707; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 1708; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 1709; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 1710; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 1711; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 1712; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 1713; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 1714; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 1715; SDAG-NEXT: 
v_accvgpr_write_b32 a15, s23 1716; SDAG-NEXT: s_nop 1 1717; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] 1718; SDAG-NEXT: v_mov_b32_e32 v0, 0 1719; SDAG-NEXT: s_nop 7 1720; SDAG-NEXT: s_nop 1 1721; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 1722; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 1723; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 1724; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 1725; SDAG-NEXT: s_endpgm 1726; 1727; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: 1728; GISEL: ; %bb.0: 1729; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1730; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1731; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1732; GISEL-NEXT: s_waitcnt lgkmcnt(0) 1733; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 1734; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 1735; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] 1736; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 1737; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 1738; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 1739; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 1740; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 1741; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 1742; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 1743; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 1744; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 1745; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 1746; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 1747; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 1748; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 1749; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 1750; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 1751; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 1752; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 1753; GISEL-NEXT: s_nop 1 1754; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] 1755; GISEL-NEXT: v_mov_b32_e32 v0, 0 1756; GISEL-NEXT: s_nop 7 1757; GISEL-NEXT: s_nop 1 1758; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 1759; GISEL-NEXT: 
global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 1760; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 1761; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 1762; GISEL-NEXT: s_endpgm 1763 %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) 1764 store <16 x i32> %result, ptr addrspace(1) %out 1765 ret void 1766} 1767 1768define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 { 1769; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: 1770; SDAG: ; %bb.0: 1771; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 1772; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1773; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1774; SDAG-NEXT: v_mov_b32_e32 v0, s20 1775; SDAG-NEXT: v_mov_b32_e32 v1, s21 1776; SDAG-NEXT: v_mov_b32_e32 v2, s22 1777; SDAG-NEXT: v_mov_b32_e32 v3, s23 1778; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1779; SDAG-NEXT: v_mov_b32_e32 v4, s24 1780; SDAG-NEXT: v_mov_b32_e32 v5, s25 1781; SDAG-NEXT: v_mov_b32_e32 v6, s26 1782; SDAG-NEXT: v_mov_b32_e32 v7, s27 1783; SDAG-NEXT: s_waitcnt lgkmcnt(0) 1784; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 1785; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 1786; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 1787; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 1788; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 1789; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 1790; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 1791; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 1792; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 1793; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 1794; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 1795; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 1796; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 1797; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 1798; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 1799; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 1800; SDAG-NEXT: s_nop 1 1801; SDAG-NEXT: 
v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 1802; SDAG-NEXT: v_mov_b32_e32 v0, 0 1803; SDAG-NEXT: s_nop 7 1804; SDAG-NEXT: s_nop 1 1805; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 1806; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 1807; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 1808; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 1809; SDAG-NEXT: s_endpgm 1810; 1811; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: 1812; GISEL: ; %bb.0: 1813; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 1814; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 1815; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 1816; GISEL-NEXT: s_waitcnt lgkmcnt(0) 1817; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] 1818; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] 1819; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] 1820; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 1821; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] 1822; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 1823; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 1824; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 1825; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 1826; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 1827; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 1828; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 1829; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 1830; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 1831; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 1832; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 1833; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 1834; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 1835; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 1836; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 1837; GISEL-NEXT: s_nop 1 1838; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 1839; GISEL-NEXT: v_mov_b32_e32 v0, 0 1840; GISEL-NEXT: s_nop 7 1841; GISEL-NEXT: s_nop 1 1842; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 1843; GISEL-NEXT: global_store_dwordx4 v0, 
a[4:7], s[0:1] offset:16 1844; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 1845; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 1846; GISEL-NEXT: s_endpgm 1847 %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 3, i32 2, i32 1) 1848 store <16 x i32> %result, ptr addrspace(1) %out 1849 ret void 1850} 1851 1852; -------------------------------------------------------------------- 1853; llvm.amdgcn.mfma.f32.16x16x32.bf16 1854; -------------------------------------------------------------------- 1855 1856declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) 1857 1858define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) { 1859; GCN-LABEL: test_mfma_f32_16x16x32_bf16: 1860; GCN: ; %bb.0: 1861; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1862; GCN-NEXT: v_accvgpr_write_b32 a0, v8 1863; GCN-NEXT: v_accvgpr_write_b32 a1, v9 1864; GCN-NEXT: v_accvgpr_write_b32 a2, v10 1865; GCN-NEXT: v_accvgpr_write_b32 a3, v11 1866; GCN-NEXT: s_nop 1 1867; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] 1868; GCN-NEXT: s_nop 6 1869; GCN-NEXT: v_accvgpr_read_b32 v0, a0 1870; GCN-NEXT: v_accvgpr_read_b32 v1, a1 1871; GCN-NEXT: v_accvgpr_read_b32 v2, a2 1872; GCN-NEXT: v_accvgpr_read_b32 v3, a3 1873; GCN-NEXT: s_setpc_b64 s[30:31] 1874 %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) 1875 ret <4 x float> %result 1876} 1877 1878define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) { 1879; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags: 1880; GCN: ; %bb.0: 1881; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1882; GCN-NEXT: v_accvgpr_write_b32 a0, v8 1883; GCN-NEXT: v_accvgpr_write_b32 a1, 
v9 1884; GCN-NEXT: v_accvgpr_write_b32 a2, v10 1885; GCN-NEXT: v_accvgpr_write_b32 a3, v11 1886; GCN-NEXT: s_nop 1 1887; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 1888; GCN-NEXT: s_nop 6 1889; GCN-NEXT: v_accvgpr_read_b32 v0, a0 1890; GCN-NEXT: v_accvgpr_read_b32 v1, a1 1891; GCN-NEXT: v_accvgpr_read_b32 v2, a2 1892; GCN-NEXT: v_accvgpr_read_b32 v3, a3 1893; GCN-NEXT: s_setpc_b64 s[30:31] 1894 %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) 1895 ret <4 x float> %result 1896} 1897 1898define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 { 1899; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: 1900; GCN: ; %bb.0: 1901; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 1902; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 1903; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1904; GCN-NEXT: v_mov_b32_e32 v8, 0 1905; GCN-NEXT: s_waitcnt lgkmcnt(0) 1906; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] 1907; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] 1908; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] 1909; GCN-NEXT: v_accvgpr_write_b32 a0, s0 1910; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] 1911; GCN-NEXT: v_accvgpr_write_b32 a1, s1 1912; GCN-NEXT: v_accvgpr_write_b32 a2, s2 1913; GCN-NEXT: v_accvgpr_write_b32 a3, s3 1914; GCN-NEXT: s_nop 1 1915; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] 1916; GCN-NEXT: s_nop 6 1917; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] 1918; GCN-NEXT: s_endpgm 1919 %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) 1920 store <4 x float> %result, ptr addrspace(1) %out 1921 ret void 1922} 1923 1924define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x 
bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 { 1925; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: 1926; GCN: ; %bb.0: 1927; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 1928; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 1929; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1930; GCN-NEXT: v_mov_b32_e32 v8, 0 1931; GCN-NEXT: s_waitcnt lgkmcnt(0) 1932; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] 1933; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] 1934; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] 1935; GCN-NEXT: v_accvgpr_write_b32 a0, s0 1936; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] 1937; GCN-NEXT: v_accvgpr_write_b32 a1, s1 1938; GCN-NEXT: v_accvgpr_write_b32 a2, s2 1939; GCN-NEXT: v_accvgpr_write_b32 a3, s3 1940; GCN-NEXT: s_nop 1 1941; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 1942; GCN-NEXT: s_nop 6 1943; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] 1944; GCN-NEXT: s_endpgm 1945 %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) 1946 store <4 x float> %result, ptr addrspace(1) %out 1947 ret void 1948} 1949 1950attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } 1951attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } 1952