; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s

define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40
; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16
; GCN-NEXT: ds_load_b128 v[12:15], v32 offset:2064
; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160
; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304
; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496
; GCN-NEXT: ds_load_b128 v[0:3], v32
; GCN-NEXT: ds_load_b128 v[8:11], v32 offset:2048
; GCN-NEXT: ds_load_b128 v[16:19], v32 offset:6144
; GCN-NEXT: ds_load_b128 v[24:27], v32 offset:12288
; GCN-NEXT: ds_load_b128 v[32:35], v32 offset:20480
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(4)
; GCN-NEXT: v_mov_b32_e32 v47, v7
; GCN-NEXT: s_waitcnt lgkmcnt(3)
; GCN-NEXT: v_mov_b32_e32 v55, v15
; GCN-NEXT: s_waitcnt lgkmcnt(2)
; GCN-NEXT: v_mov_b32_e32 v63, v23
; GCN-NEXT: s_waitcnt lgkmcnt(1)
; GCN-NEXT: v_mov_b32_e32 v71, v31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6
; GCN-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4
; GCN-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2
; GCN-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
; GCN-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13
; GCN-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11
; GCN-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9
; GCN-NEXT: v_mov_b32_e32 v48, v8
; GCN-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21
; GCN-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19
; GCN-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17
; GCN-NEXT: v_mov_b32_e32 v56, v16
; GCN-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29
; GCN-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27
; GCN-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25
; GCN-NEXT: v_mov_b32_e32 v64, v24
; GCN-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37
; GCN-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35
; GCN-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33
; GCN-NEXT: v_mov_b32_e32 v72, v32
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[0:7], v[0:7], v[40:47]
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[8:15], v[8:15], v[48:55]
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[56:63], v[16:23], v[16:23], v[56:63]
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[64:71], v[24:31], v[24:31], v[64:71]
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[72:79], v[32:39], v[32:39], v[72:79]
; GCN-NEXT: ds_store_b128 v80, v[44:47] offset:16
; GCN-NEXT: ds_store_b128 v80, v[40:43]
; GCN-NEXT: ds_store_b128 v81, v[52:55] offset:2064
; GCN-NEXT: ds_store_b128 v81, v[48:51] offset:2048
; GCN-NEXT: ds_store_b128 v81, v[60:63] offset:4112
; GCN-NEXT: ds_store_b128 v81, v[56:59] offset:4096
; GCN-NEXT: ds_store_b128 v81, v[68:71] offset:6160
; GCN-NEXT: ds_store_b128 v81, v[64:67] offset:6144
; GCN-NEXT: ds_store_b128 v81, v[76:79] offset:8208
; GCN-NEXT: ds_store_b128 v81, v[72:75] offset:8192
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(10) SyncID(0)
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16
; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v32 offset:2064
; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160
; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304
; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v32 offset:2048
; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v32 offset:6144
; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v32 offset:12288
; EXACTCUTOFF-NEXT: ds_load_b128 v[32:35], v32 offset:20480
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4)
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v47, v7
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(3)
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v55, v15
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2)
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v63, v23
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1)
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v71, v31
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, v8
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v56, v16
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v64, v24
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v72, v32
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[0:7], v[0:7], v[40:47]
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[8:15], v[8:15], v[48:55]
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[56:63], v[16:23], v[16:23], v[56:63]
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[64:71], v[24:31], v[24:31], v[64:71]
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[72:79], v[32:39], v[32:39], v[72:79]
; EXACTCUTOFF-NEXT: ds_store_b128 v80, v[44:47] offset:16
; EXACTCUTOFF-NEXT: ds_store_b128 v80, v[40:43]
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[52:55] offset:2064
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[48:51] offset:2048
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[60:63] offset:4112
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[56:59] offset:4096
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[68:71] offset:6160
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[64:67] offset:6144
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[76:79] offset:8208
; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[72:75] offset:8192
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(10) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
  %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
  %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
  %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
  %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
  %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
  %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
  %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
  %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
  store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
  store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
  store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
  store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
  store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 10 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 10, i32 0)
  ; 5 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
  ; 10 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 10, i32 0)
  ret void
}

define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16
; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16
; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16
; GCN-NEXT: ds_load_b128 v[0:3], v17
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:16
; GCN-NEXT: ds_store_b128 v16, v[8:11]
; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:2064
; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:2048
; GCN-NEXT: v_mov_b32_e32 v16, s1
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2064
; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:2048
; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:6160
; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:6144
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:4112
; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:4096
; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:12304
; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:12288
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:6160
; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:6144
; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:20496
; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:20480
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:8208
; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:8192
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:16
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11]
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:2064
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:2048
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2064
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:2048
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:6160
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:6144
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:4112
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:4096
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:12304
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:12288
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:6160
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:6144
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:20496
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:20480
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:8208
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:8192
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
  %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
  %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
  %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
  %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
  %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
  %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
  %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
  %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
  store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
  store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
  store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
  store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
  store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #2
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone speculatable }