; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s

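; llvm.amdgcn.sched.group.barrier takes (mask, size, SyncID): each call asks
; the scheduler to place size instructions of the masked class at that point,
; and barriers sharing a SyncID are ordered as one pipeline. The mask bits
; exercised below follow the intrinsic's documented encoding: 0x00000008 =
; MFMA/WMMA, 0x00000100 = DS read, 0x00000200 = DS write. The first kernel
; requests a clustered pipeline: all 10 DS reads, then all 5 WMMAs, then all
; 10 DS writes.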
define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT:    v_and_b32_e32 v40, 0x7fe0, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_nc_u32_e32 v32, s0, v40
; GCN-NEXT:    v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
; GCN-NEXT:    ds_load_b128 v[4:7], v32 offset:16
; GCN-NEXT:    ds_load_b128 v[12:15], v32 offset:2064
; GCN-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
; GCN-NEXT:    ds_load_b128 v[28:31], v32 offset:12304
; GCN-NEXT:    ds_load_b128 v[36:39], v32 offset:20496
; GCN-NEXT:    ds_load_b128 v[0:3], v32
; GCN-NEXT:    ds_load_b128 v[8:11], v32 offset:2048
; GCN-NEXT:    ds_load_b128 v[16:19], v32 offset:6144
; GCN-NEXT:    ds_load_b128 v[24:27], v32 offset:12288
; GCN-NEXT:    ds_load_b128 v[32:35], v32 offset:20480
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(10) SyncID(0)
; GCN-NEXT:    s_waitcnt lgkmcnt(4)
; GCN-NEXT:    v_mov_b32_e32 v47, v7
; GCN-NEXT:    s_waitcnt lgkmcnt(3)
; GCN-NEXT:    v_mov_b32_e32 v55, v15
; GCN-NEXT:    s_waitcnt lgkmcnt(2)
; GCN-NEXT:    v_mov_b32_e32 v63, v23
; GCN-NEXT:    s_waitcnt lgkmcnt(1)
; GCN-NEXT:    v_mov_b32_e32 v71, v31
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6
; GCN-NEXT:    v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4
; GCN-NEXT:    v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2
; GCN-NEXT:    v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
; GCN-NEXT:    v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13
; GCN-NEXT:    v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11
; GCN-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9
; GCN-NEXT:    v_mov_b32_e32 v48, v8
; GCN-NEXT:    v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21
; GCN-NEXT:    v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19
; GCN-NEXT:    v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17
; GCN-NEXT:    v_mov_b32_e32 v56, v16
; GCN-NEXT:    v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29
; GCN-NEXT:    v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27
; GCN-NEXT:    v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25
; GCN-NEXT:    v_mov_b32_e32 v64, v24
; GCN-NEXT:    v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37
; GCN-NEXT:    v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35
; GCN-NEXT:    v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33
; GCN-NEXT:    v_mov_b32_e32 v72, v32
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[40:47], v[0:7], v[0:7], v[40:47]
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[48:55], v[8:15], v[8:15], v[48:55]
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[56:63], v[16:23], v[16:23], v[56:63]
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[64:71], v[24:31], v[24:31], v[64:71]
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[72:79], v[32:39], v[32:39], v[72:79]
; GCN-NEXT:    ds_store_b128 v80, v[44:47] offset:16
; GCN-NEXT:    ds_store_b128 v80, v[40:43]
; GCN-NEXT:    ds_store_b128 v81, v[52:55] offset:2064
; GCN-NEXT:    ds_store_b128 v81, v[48:51] offset:2048
; GCN-NEXT:    ds_store_b128 v81, v[60:63] offset:4112
; GCN-NEXT:    ds_store_b128 v81, v[56:59] offset:4096
; GCN-NEXT:    ds_store_b128 v81, v[68:71] offset:6160
; GCN-NEXT:    ds_store_b128 v81, v[64:67] offset:6144
; GCN-NEXT:    ds_store_b128 v81, v[76:79] offset:8208
; GCN-NEXT:    ds_store_b128 v81, v[72:75] offset:8192
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(10) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_and_b32_e32 v40, 0x7fe0, v0
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v32, s0, v40
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v32 offset:16
; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v32 offset:2064
; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
; EXACTCUTOFF-NEXT:    ds_load_b128 v[28:31], v32 offset:12304
; EXACTCUTOFF-NEXT:    ds_load_b128 v[36:39], v32 offset:20496
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v32
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v32 offset:2048
; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v32 offset:6144
; EXACTCUTOFF-NEXT:    ds_load_b128 v[24:27], v32 offset:12288
; EXACTCUTOFF-NEXT:    ds_load_b128 v[32:35], v32 offset:20480
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(10) SyncID(0)
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(4)
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v47, v7
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(3)
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v55, v15
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(2)
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v63, v23
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(1)
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v71, v31
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, v8
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v56, v16
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v64, v24
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v72, v32
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[40:47], v[0:7], v[0:7], v[40:47]
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[48:55], v[8:15], v[8:15], v[48:55]
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[56:63], v[16:23], v[16:23], v[56:63]
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[64:71], v[24:31], v[24:31], v[64:71]
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[72:79], v[32:39], v[32:39], v[72:79]
; EXACTCUTOFF-NEXT:    ds_store_b128 v80, v[44:47] offset:16
; EXACTCUTOFF-NEXT:    ds_store_b128 v80, v[40:43]
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[52:55] offset:2064
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[48:51] offset:2048
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[60:63] offset:4112
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[56:59] offset:4096
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[68:71] offset:6160
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[64:67] offset:6144
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[76:79] offset:8208
; EXACTCUTOFF-NEXT:    ds_store_b128 v81, v[72:75] offset:8192
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(10) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
  %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
  %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
  %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
  %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
  %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
  %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
  %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
  %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
  store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
  store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
  store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
  store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
  store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 10 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 10, i32 0)
  ; 5 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
  ; 10 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 10, i32 0)
  ret void
}

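; In contrast to the clustered kernel above, this kernel interleaves the
; pipeline: the (2 DS read, 1 WMMA, 2 DS write) barrier sequence repeats five
; times, so loads, WMMAs, and stores alternate through the schedule.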
define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT:    v_and_b32_e32 v16, 0x7fe0, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_nc_u32_e32 v17, s0, v16
; GCN-NEXT:    v_add_nc_u32_e32 v16, s1, v16
; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:16
; GCN-NEXT:    ds_load_b128 v[0:3], v17
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:16
; GCN-NEXT:    ds_store_b128 v16, v[8:11]
; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
; GCN-NEXT:    v_mov_b32_e32 v16, s1
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:2064
; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:2048
; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:6160
; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:6144
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:4112
; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:4096
; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:12304
; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:12288
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:6160
; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:6144
; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:20496
; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:20480
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:8208
; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:8192
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_and_b32_e32 v16, 0x7fe0, v0
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v17, s0, v16
; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v16, s1, v16
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:16
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:16
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11]
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s1
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:2064
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:2048
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:6160
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:6144
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:4112
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:4096
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:12304
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:12288
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:6160
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:6144
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:20496
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:20480
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:8208
; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:8192
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64
  %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192
  %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256
  %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0)
  %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0)
  %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0)
  %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0)
  %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0)
  %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx
  store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64
  store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128
  store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192
  store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256
  store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ; 2 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
  ; 1 WMMA
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 2 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0)
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #2
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <16 x half>, i1 immarg) #1
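; The trailing i1 immarg on the wmma intrinsic is its opsel operand; for the
; f16 variant it selects whether each result is written to the low or high
; half of the accumulator VGPRs (these tests pass 0, i.e. the low halves).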

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone speculatable }