; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s

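; Reading aid (not checked by FileCheck): llvm.amdgcn.sched.group.barrier(mask,
; size, syncid) asks the scheduler to place `size` instructions matching `mask`
; at this point in the pipeline named by `syncid`. Per the AMDGPU backend's
; SchedGroupMask bits, 0x8 matches MFMA/WMMA/SWMMAC ops, 0x100 matches DS
; reads, and 0x200 matches DS writes. The second RUN line additionally
; exercises the IGroupLP exact solver with a branch cutoff of 250000.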
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)

define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GCN-NEXT:    v_mov_b32_e32 v48, 0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT:    v_and_b32_e32 v28, 0x3ff0, v0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    v_add_nc_u32_e32 v0, s0, v28
; GCN-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; GCN-NEXT:    ds_load_b128 v[8:11], v0
; GCN-NEXT:    ds_load_b128 v[12:15], v0 offset:512
; GCN-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
; GCN-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
; GCN-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
; GCN-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
; GCN-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x6
; GCN-NEXT:    v_mov_b32_e32 v31, v11
; GCN-NEXT:    s_wait_dscnt 0x5
; GCN-NEXT:    v_mov_b32_e32 v35, v15
; GCN-NEXT:    s_wait_dscnt 0x4
; GCN-NEXT:    v_mov_b32_e32 v39, v19
; GCN-NEXT:    s_wait_dscnt 0x3
; GCN-NEXT:    v_mov_b32_e32 v43, v23
; GCN-NEXT:    s_wait_dscnt 0x2
; GCN-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; GCN-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; GCN-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GCN-NEXT:    v_mov_b32_e32 v32, v12
; GCN-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; GCN-NEXT:    v_mov_b32_e32 v36, v16
; GCN-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; GCN-NEXT:    v_mov_b32_e32 v40, v20
; GCN-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; GCN-NEXT:    v_mov_b32_e32 v44, v24
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; GCN-NEXT:    ds_store_b128 v49, v[28:31]
; GCN-NEXT:    ds_store_b128 v50, v[32:35] offset:512
; GCN-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
; GCN-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
; GCN-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, 0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_and_b32_e32 v28, 0x3ff0, v0
; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s0, v28
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v0
; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v0 offset:512
; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
; EXACTCUTOFF-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x6
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v31, v11
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x5
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v35, v15
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x4
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v39, v19
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x3
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v43, v23
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v32, v12
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v36, v16
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v40, v20
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v44, v24
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; EXACTCUTOFF-NEXT:    ds_store_b128 v49, v[28:31]
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[32:35] offset:512
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 7 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
  ; 5 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
  ; 5 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
  ret void
}

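; The interleaved variant below repeats a (1 DS read, 1 SWMMAC, 1 DS write)
; group per result, so the checks expect loads, compute, and stores to
; alternate rather than cluster as in the test above.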
define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
; GCN-NEXT:    v_mov_b32_e32 v18, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT:    v_lshl_add_u32 v17, v16, 5, s0
; GCN-NEXT:    v_lshl_add_u32 v16, v16, 4, s1
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:1024
; GCN-NEXT:    ds_load_b128 v[0:3], v17
; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:16
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x2
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15]
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:2560
; GCN-NEXT:    v_mov_b32_e32 v16, s1
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:512
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:4608
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:1024
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:7168
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:1536
; GCN-NEXT:    ds_load_b128 v[8:11], v17 offset:10240
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:2048
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v18, 0
; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v16, 5, s0
; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v16, v16, 4, s1
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:1024
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:16
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15]
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:2560
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s1
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:512
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:4608
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:1024
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:7168
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:1536
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v17 offset:10240
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:2048
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 3 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ret void
}