; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s

; Each test in this file feeds a <32 x float> value through
; @llvm.amdgcn.mfma.f32.32x32x1f32 inside a loop, using a different kind of
; initial value, and checks the v_accvgpr_* traffic before, inside and after
; the loop.
;
; Check prefixes enabled by the three llc invocations above:
;   GCN       - gfx908, gfx90a and gfx940
;   GFX908    - gfx908 only
;   GFX90A    - gfx90a only
;   GFX940    - gfx940 only
;   GFX908_A  - gfx908 and gfx90a
;   GFX940_A  - gfx90a and gfx940

; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; Check that we do not copy agprs to vgprs and back inside the loop.

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The loop-carried vector starts as zeroinitializer. It is passed as the vector
; operand of the MFMA intrinsic and the result feeds back through %phi.
define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
entry:
  br label %for.cond.preheader

; Single-block loop: %c counts from 0 and the loop exits once %inc reaches 16.
for.cond.preheader:
  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

; Only the value produced by the last iteration is stored.
exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:

; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.
; Check that we do not use 32 temp sgprs as well.
47 48; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 49; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 50 51; GCN: [[LOOP:.LBB[0-9_]+]]: 52; GCN-NOT: v_accvgpr 53; GFX908_A: v_mfma_f32_32x32x1f32 54; GFX940: v_mfma_f32_32x32x1_2b_f32 55; GCN-NOT: v_accvgpr 56; GCN: s_cbranch_scc1 [[LOOP]] 57 58; GFX908-COUNT-32: v_accvgpr_read_b32 59; GFX90A-NOT: v_accvgpr_read_b32 60; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 61; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 62 63define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 { 64entry: 65 br label %for.cond.preheader 66 67for.cond.preheader: 68 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ] 69 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 70 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 71 %inc = add nuw nsw i32 %c, 1 72 %cc = icmp eq i32 %inc, 16 73 br i1 %cc, label %exit, label %for.cond.preheader 74 75exit: 76 store <32 x float> %mai.1, ptr addrspace(1) %arg 77 ret void 78} 79 80; GCN-LABEL: {{^}}test_mfma_loop_non_splat: 81 82; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 83; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} 84 85; GCN: [[LOOP:.LBB[0-9_]+]]: 86; GCN-NOT: v_accvgpr 87; GFX908_A: v_mfma_f32_32x32x1f32 88; GFX940: v_mfma_f32_32x32x1_2b_f32 89; GCN-NOT: v_accvgpr 90; GCN: s_cbranch_scc1 [[LOOP]] 91 92; GFX908-COUNT-32: v_accvgpr_read_b32 93; GFX90A-NOT: v_accvgpr_read_b32 94; GFX908-COUNT-8: 
global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 95; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 96 97define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { 98entry: 99 br label %for.cond.preheader 100 101for.cond.preheader: 102 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ] 103 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 104 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 105 %inc = add nuw nsw i32 %c, 1 106 %cc = icmp eq i32 %inc, 16 107 br i1 %cc, label %exit, label %for.cond.preheader 108 109exit: 110 store <32 x float> %mai.1, ptr addrspace(1) %arg 111 ret void 112} 113 114; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: 115 116; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 117; 3 vgprs are needed to avoid wait states between writes. 
; NOTE(review): none of the llc/FileCheck invocations at the top of this file
; enables a check prefix spelled GFX-908 (the gfx908 run uses the spelling
; without the dash), so FileCheck treats the block below as plain comments and
; the sequence is not verified. The write patterns also omit the leading 'a'
; of the AGPR operand, so they presumably need updating against current llc
; output before the prefix is corrected - TODO confirm.

; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x42f80000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x42fa0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x42fc0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x42fe0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43000000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43010000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43020000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43030000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43040000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43050000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43060000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43070000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43080000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43090000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x430a0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x430b0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x430c0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x430d0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x430e0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x430f0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43100000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43110000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43120000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43130000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43140000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43150000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43160000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43170000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43180000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x43190000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
; GFX-908: v_mov_b32_e32 v0, 0x431a0000
; GFX-908: s_nop 1
; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0

; FIXME: Constant is now in VGPR instead of SGPR.
; GFX940_A: v_mov_b32_e32 v{{[0-9]+}}, 0x4{{[0-9a-f]+}}
; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The loop-carried vector starts as the 32 distinct constants 123.0 .. 154.0,
; so no single value can be splatted across the accumulator.
define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) #0 {
entry:
  br label %for.cond.preheader

; Single-block loop: %c counts from 0 and the loop exits once %inc reaches 16.
for.cond.preheader:
  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Every element of the initial vector is the workitem id bit-cast to float;
; the directives above expect all 32 accumulator writes to read v0.
define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  ; Build the splat one element at a time (elements 0 .. 31).
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

; Single-block loop: %c counts from 0 and the loop exits once %inc reaches 16.
for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:

; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX940_A-COUNT-32: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Every element of the initial vector is the kernel argument %init. The
; directives above expect gfx908 to copy it from an SGPR into one VGPR first,
; and gfx90a/gfx940 to write the accumulator registers from the SGPR directly.
define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float %init) #0 {
entry:
  ; Build the splat one element at a time (elements 0 .. 31).
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

; Single-block loop: %c counts from 0 and the loop exits once %inc reaches 16.
for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:

; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
; GFX908-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX940_A-DAG: s_load_dword [[TMP:s[0-9]+]],
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-28: v_accvgpr_write_b32 a{{[0-9]+}}, 0

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Initial vector mixes three sources: element 0 is the workitem id bit-cast to
; float, element 1 is the kernel argument %x, and the other 30 elements are
; zero.
define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, float %x) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1

  br label %for.cond.preheader

; Single-block loop: %c counts from 0 and the loop exits once %inc reaches 16.
for.cond.preheader:
  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr
; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GCN-NOT: v_accvgpr

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The initial vector is the result of an MFMA executed before the loop (on a
; zeroinitializer input) and is forwarded unchanged into the loop's phi.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %arg) #0 {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %for.cond.preheader

; Single-block loop: %c counts from 0 and the loop exits once %inc reaches 16.
for.cond.preheader:
  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr
; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}

; Check that we are using only one tmp VGPR.

; GFX908: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
; GFX940_A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Element 0 of a pre-loop MFMA result is extracted and splatted into all 32
; elements of the initial vector, so the splat source is itself an MFMA output.
define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  %init = extractelement <32 x float> %mai.0, i32 0
  ; Build the splat one element at a time (elements 0 .. 31).
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

; Single-block loop: %c counts from 0 and the loop exits once %inc reaches 16.
for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; Check that we do not copy agprs to vgprs and back in an outer loop.
; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[OUTER_LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Two nested 16-iteration loops share one zero-initialised vector: the inner
; loop's MFMA result is carried around both the inner and the outer back edge.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) #0 {
entry:
  br label %for.cond.preheader

; Outer loop header: %phi.0 receives the inner loop's result on the back edge
; from %inner.exit, and %c.0 is the outer induction variable.
for.cond.preheader:
  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
  br label %inner.for.cond.preheader

; Inner single-block loop: %c restarts at 0 on every entry from the outer
; header and the loop exits once %inc reaches 16.
inner.for.cond.preheader:
  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader

; Outer loop latch: advance %c.0 and leave once %inc.0 reaches 16.
inner.exit:
  %inc.0 = add nuw nsw i32 %c.0, 1
  %cc.0 = icmp eq i32 %inc.0, 16
  br i1 %cc.0, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Intrinsics used by the tests in this file.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

; Attribute group #0 is attached to every kernel defined in this file.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }