xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT0 %s
3; RUN: llc -mtriple=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT1 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT2 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT3 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT4 %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT5 %s
8; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT6 %s
9
; Checks the code expected around llvm.amdgcn.s.barrier when the barrier sits
; between a store to %out and a later load from %out, for each of the seven
; llc configurations listed in the header of this file.
;
; Each work-item stores its x id to out[id], executes the barrier, then loads
; out[size - 1 - id] and stores the loaded value back to out[id].
;
; What the expected output pins down around the barrier:
;   * VARIANT0, VARIANT2, VARIANT4 and VARIANT6 expect a wait instruction
;     directly before the barrier (s_waitcnt vmcnt(0) expcnt(0), s_waitcnt
;     vmcnt(0), or s_wait_storecnt 0x0 on gfx1200).
;   * VARIANT1, VARIANT3 and VARIANT5 are the +auto-waitcnt-before-barrier
;     runs and expect no wait instruction between the store and the barrier.
;     VARIANT1 still expects s_waitcnt expcnt(0) before the later buffer_load.
;   * The gfx1200 runs expect the barrier as s_barrier_signal -1 followed by
;     s_barrier_wait -1; the other runs expect a single s_barrier.
10define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
11; VARIANT0-LABEL: test_barrier:
12; VARIANT0:       ; %bb.0: ; %entry
13; VARIANT0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
14; VARIANT0-NEXT:    s_load_dword s4, s[4:5], 0xb
15; VARIANT0-NEXT:    s_mov_b32 s3, 0xf000
16; VARIANT0-NEXT:    s_mov_b32 s2, 0
17; VARIANT0-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
18; VARIANT0-NEXT:    v_mov_b32_e32 v2, 0
19; VARIANT0-NEXT:    v_not_b32_e32 v3, v0
20; VARIANT0-NEXT:    s_waitcnt lgkmcnt(0)
21; VARIANT0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
22; VARIANT0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
23; VARIANT0-NEXT:    s_barrier
24; VARIANT0-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
25; VARIANT0-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
26; VARIANT0-NEXT:    v_lshl_b64 v[3:4], v[3:4], 2
27; VARIANT0-NEXT:    buffer_load_dword v0, v[3:4], s[0:3], 0 addr64
28; VARIANT0-NEXT:    s_waitcnt vmcnt(0)
29; VARIANT0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
30; VARIANT0-NEXT:    s_endpgm
31;
32; VARIANT1-LABEL: test_barrier:
33; VARIANT1:       ; %bb.0: ; %entry
34; VARIANT1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
35; VARIANT1-NEXT:    s_load_dword s4, s[4:5], 0xb
36; VARIANT1-NEXT:    s_mov_b32 s3, 0xf000
37; VARIANT1-NEXT:    s_mov_b32 s2, 0
38; VARIANT1-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
39; VARIANT1-NEXT:    v_mov_b32_e32 v2, 0
40; VARIANT1-NEXT:    v_not_b32_e32 v3, v0
41; VARIANT1-NEXT:    s_waitcnt lgkmcnt(0)
42; VARIANT1-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
43; VARIANT1-NEXT:    s_barrier
44; VARIANT1-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
45; VARIANT1-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
46; VARIANT1-NEXT:    v_lshl_b64 v[3:4], v[3:4], 2
47; VARIANT1-NEXT:    s_waitcnt expcnt(0)
48; VARIANT1-NEXT:    buffer_load_dword v0, v[3:4], s[0:3], 0 addr64
49; VARIANT1-NEXT:    s_waitcnt vmcnt(0)
50; VARIANT1-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
51; VARIANT1-NEXT:    s_endpgm
52;
53; VARIANT2-LABEL: test_barrier:
54; VARIANT2:       ; %bb.0: ; %entry
55; VARIANT2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
56; VARIANT2-NEXT:    s_load_dword s2, s[4:5], 0x2c
57; VARIANT2-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
58; VARIANT2-NEXT:    s_waitcnt lgkmcnt(0)
59; VARIANT2-NEXT:    global_store_dword v2, v0, s[0:1]
60; VARIANT2-NEXT:    v_xad_u32 v0, v0, -1, s2
61; VARIANT2-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
62; VARIANT2-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
63; VARIANT2-NEXT:    v_mov_b32_e32 v3, s1
64; VARIANT2-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
65; VARIANT2-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
66; VARIANT2-NEXT:    s_waitcnt vmcnt(0)
67; VARIANT2-NEXT:    s_barrier
68; VARIANT2-NEXT:    global_load_dword v0, v[0:1], off
69; VARIANT2-NEXT:    s_waitcnt vmcnt(0)
70; VARIANT2-NEXT:    global_store_dword v2, v0, s[0:1]
71; VARIANT2-NEXT:    s_endpgm
72;
73; VARIANT3-LABEL: test_barrier:
74; VARIANT3:       ; %bb.0: ; %entry
75; VARIANT3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
76; VARIANT3-NEXT:    s_load_dword s2, s[4:5], 0x2c
77; VARIANT3-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
78; VARIANT3-NEXT:    s_waitcnt lgkmcnt(0)
79; VARIANT3-NEXT:    global_store_dword v2, v0, s[0:1]
80; VARIANT3-NEXT:    v_xad_u32 v0, v0, -1, s2
81; VARIANT3-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
82; VARIANT3-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
83; VARIANT3-NEXT:    v_mov_b32_e32 v3, s1
84; VARIANT3-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
85; VARIANT3-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
86; VARIANT3-NEXT:    s_barrier
87; VARIANT3-NEXT:    global_load_dword v0, v[0:1], off
88; VARIANT3-NEXT:    s_waitcnt vmcnt(0)
89; VARIANT3-NEXT:    global_store_dword v2, v0, s[0:1]
90; VARIANT3-NEXT:    s_endpgm
91;
92; VARIANT4-LABEL: test_barrier:
93; VARIANT4:       ; %bb.0: ; %entry
94; VARIANT4-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
95; VARIANT4-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
96; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_1)
97; VARIANT4-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
98; VARIANT4-NEXT:    s_wait_kmcnt 0x0
99; VARIANT4-NEXT:    v_xad_u32 v0, v2, -1, s2
100; VARIANT4-NEXT:    global_store_b32 v3, v2, s[0:1]
101; VARIANT4-NEXT:    s_wait_storecnt 0x0
102; VARIANT4-NEXT:    s_barrier_signal -1
103; VARIANT4-NEXT:    s_barrier_wait -1
104; VARIANT4-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
105; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
106; VARIANT4-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
107; VARIANT4-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
108; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_2)
109; VARIANT4-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
110; VARIANT4-NEXT:    global_load_b32 v0, v[0:1], off
111; VARIANT4-NEXT:    s_wait_loadcnt 0x0
112; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
113; VARIANT4-NEXT:    s_endpgm
114;
115; VARIANT5-LABEL: test_barrier:
116; VARIANT5:       ; %bb.0: ; %entry
117; VARIANT5-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
118; VARIANT5-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
119; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_1)
120; VARIANT5-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
121; VARIANT5-NEXT:    s_wait_kmcnt 0x0
122; VARIANT5-NEXT:    v_xad_u32 v0, v2, -1, s2
123; VARIANT5-NEXT:    global_store_b32 v3, v2, s[0:1]
124; VARIANT5-NEXT:    s_barrier_signal -1
125; VARIANT5-NEXT:    s_barrier_wait -1
126; VARIANT5-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
127; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
128; VARIANT5-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
129; VARIANT5-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
130; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_2)
131; VARIANT5-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
132; VARIANT5-NEXT:    global_load_b32 v0, v[0:1], off
133; VARIANT5-NEXT:    s_wait_loadcnt 0x0
134; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
135; VARIANT5-NEXT:    s_endpgm
136;
137; VARIANT6-LABEL: test_barrier:
138; VARIANT6:       ; %bb.0: ; %entry
139; VARIANT6-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
140; VARIANT6-NEXT:    s_wait_kmcnt 0x0
141; VARIANT6-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0
142; VARIANT6-NEXT:    s_add_co_i32 s2, s2, -1
143; VARIANT6-NEXT:    s_delay_alu instid0(VALU_DEP_1)
144; VARIANT6-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4
145; VARIANT6-NEXT:    v_sub_nc_u32_e32 v0, s2, v4
146; VARIANT6-NEXT:    global_store_b32 v5, v4, s[0:1]
147; VARIANT6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
148; VARIANT6-NEXT:    s_wait_storecnt 0x0
149; VARIANT6-NEXT:    s_barrier_signal -1
150; VARIANT6-NEXT:    s_barrier_wait -1
151; VARIANT6-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
152; VARIANT6-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
153; VARIANT6-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
154; VARIANT6-NEXT:    s_delay_alu instid0(VALU_DEP_2)
155; VARIANT6-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
156; VARIANT6-NEXT:    global_load_b32 v0, v[0:1], off
157; VARIANT6-NEXT:    s_wait_loadcnt 0x0
158; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
159; VARIANT6-NEXT:    s_endpgm
160entry:
  ; %tmp = this work-item's x id; %tmp1 = &out[%tmp].
161  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
162  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
  ; First store: out[id] = id. This is the store that precedes the barrier.
163  store i32 %tmp, ptr addrspace(1) %tmp1
  ; The intrinsic under test.
164  call void @llvm.amdgcn.s.barrier()
  ; %tmp4 = (size - 1) - id, i.e. the index mirrored across the range
  ; [0, size); %tmp5 = &out[%tmp4].
165  %tmp3 = sub i32 %size, 1
166  %tmp4 = sub i32 %tmp3, %tmp
167  %tmp5 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp4
  ; Read the mirrored slot after the barrier and write it back to out[id].
168  %tmp6 = load i32, ptr addrspace(1) %tmp5
169  store i32 %tmp6, ptr addrspace(1) %tmp1
170  ret void
171}
172
; Intrinsics called by @test_barrier: the barrier under test (takes no
; arguments, returns nothing) and the query for the work-item's x id (returns
; i32).
173declare void @llvm.amdgcn.s.barrier() #1
174declare i32 @llvm.amdgcn.workitem.id.x() #2
175
; Attribute groups referenced in this file:
;   #0 - on the @test_barrier kernel definition.
;   #1 - on the @llvm.amdgcn.s.barrier declaration; marked convergent.
;   #2 - on the @llvm.amdgcn.workitem.id.x declaration; marked readnone.
176attributes #0 = { nounwind }
177attributes #1 = { convergent nounwind }
178attributes #2 = { nounwind readnone }
179