; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s

; Index operand is i16 (matches the ".i16" mangling suffix below and the
; i16 0 index passed at every call site).
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)

define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GCN-NEXT:    v_mov_b32_e32 v48, 0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT:    v_and_b32_e32 v28, 0x3ff0, v0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    v_add_nc_u32_e32 v0, s0, v28
; GCN-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; GCN-NEXT:    ds_load_b128 v[8:11], v0
; GCN-NEXT:    ds_load_b128 v[12:15], v0 offset:512
; GCN-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
; GCN-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
; GCN-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
; GCN-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
; GCN-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x6
; GCN-NEXT:    v_mov_b32_e32 v31, v11
; GCN-NEXT:    s_wait_dscnt 0x5
; GCN-NEXT:    v_mov_b32_e32 v35, v15
; GCN-NEXT:    s_wait_dscnt 0x4
; GCN-NEXT:    v_mov_b32_e32 v39, v19
; GCN-NEXT:    s_wait_dscnt 0x3
; GCN-NEXT:    v_mov_b32_e32 v43, v23
; GCN-NEXT:    s_wait_dscnt 0x2
; GCN-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; GCN-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; GCN-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GCN-NEXT:    v_mov_b32_e32 v32, v12
; GCN-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; GCN-NEXT:    v_mov_b32_e32 v36, v16
; GCN-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; GCN-NEXT:    v_mov_b32_e32 v40, v20
; GCN-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; GCN-NEXT:    v_mov_b32_e32 v44, v24
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; GCN-NEXT:    ds_store_b128 v49, v[28:31]
; GCN-NEXT:    ds_store_b128 v50, v[32:35] offset:512
; GCN-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
; GCN-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
; GCN-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, 0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_and_b32_e32 v28, 0x3ff0, v0
; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s0, v28
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v0
; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v0 offset:512
; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
; EXACTCUTOFF-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x6
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v31, v11
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x5
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v35, v15
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x4
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v39, v19
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x3
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v43, v23
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v32, v12
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v36, v16
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v40, v20
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v44, v24
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; EXACTCUTOFF-NEXT:    ds_store_b128 v49, v[28:31]
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[32:35] offset:512
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 7 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
  ; 5 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
  ; 5 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
  ret void
}

define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
; GCN-NEXT:    v_mov_b32_e32 v18, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT:    v_lshl_add_u32 v17, v16, 5, s0
; GCN-NEXT:    v_lshl_add_u32 v16, v16, 4, s1
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:1024
; GCN-NEXT:    ds_load_b128 v[0:3], v17
; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:16
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x2
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15]
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:2560
; GCN-NEXT:    v_mov_b32_e32 v16, s1
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:512
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:4608
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:1024
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:7168
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:1536
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:10240
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:2048
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v18, 0
; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v16, 5, s0
; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v16, v16, 4, s1
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:1024
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:16
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15]
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:2560
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s1
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:512
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:4608
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:1024
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:7168
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:1536
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:10240
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:2048
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 3 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ret void
}