xref: /llvm-project/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s
6
7; Test case looks at the allocated offset of @used_by_both. It's at zero when
8; allocated by itself, but at 8 when allocated in combination with the double.
9; Redundantly also checks LDSByteSize.
10@used_by_both = addrspace(3) global i32 undef ; stored to by @nonkernel, @withcall and @nocall_false_sharing
11@used_by_kernel = addrspace(3) global i32 undef ; stored to by @nocall_ideal only
12@used_by_function = addrspace(3) global double undef ; stored to by @nonkernel only
13
14; A kernel that calls no functions and uses a single LDS variable allocates only
15; that variable, so the access is at offset 0 and LDSByteSize is 4.
; The only LDS variable stored to here is @used_by_kernel.
16define amdgpu_kernel void @nocall_ideal() {
17; CHECK-LABEL: nocall_ideal:
18; CHECK:       ; %bb.0:
19; CHECK-NEXT:    v_mov_b32_e32 v0, 0
20; CHECK-NEXT:    ds_write_b32 v0, v0
21; CHECK-NEXT:    s_endpgm
22store i32 0, ptr addrspace(3) @used_by_kernel
23  ret void
24}
25; CHECK: ; LDSByteSize: 4 bytes
26
; Non-kernel function that writes both @used_by_both (i32) and
; @used_by_function (double). Every run configuration expects the 64-bit write
; at address 0 and the 32-bit write at address 8; the runs with -global-isel
; hold the 8 in a register rather than in an offset:8 operand.
27define void @nonkernel() {
28; GFX9-LABEL: nonkernel:
29; GFX9:       ; %bb.0:
30; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31; GFX9-NEXT:    v_mov_b32_e32 v0, 0
32; GFX9-NEXT:    v_mov_b32_e32 v1, v0
33; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
34; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
35; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
36; GFX9-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX10-LABEL: nonkernel:
39; GFX10:       ; %bb.0:
40; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX10-NEXT:    v_mov_b32_e32 v0, 0
42; GFX10-NEXT:    v_mov_b32_e32 v1, v0
43; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
44; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
45; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX10-NEXT:    s_setpc_b64 s[30:31]
47;
48; G_GFX9-LABEL: nonkernel:
49; G_GFX9:       ; %bb.0:
50; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; G_GFX9-NEXT:    v_mov_b32_e32 v2, 0
52; G_GFX9-NEXT:    v_mov_b32_e32 v3, 8
53; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
54; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0
55; G_GFX9-NEXT:    ds_write_b32 v3, v2
56; G_GFX9-NEXT:    ds_write_b64 v2, v[0:1]
57; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
58; G_GFX9-NEXT:    s_setpc_b64 s[30:31]
59;
60; G_GFX10-LABEL: nonkernel:
61; G_GFX10:       ; %bb.0:
62; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
64; G_GFX10-NEXT:    v_mov_b32_e32 v3, 8
65; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
66; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
67; G_GFX10-NEXT:    ds_write_b32 v3, v2
68; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
69; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
70; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
71  store i32 0, ptr addrspace(3) @used_by_both
72  store double 0.0, ptr addrspace(3) @used_by_function
73  ret void
74}
75
76; This kernel stores to @used_by_both and then calls @nonkernel, so both
; @used_by_both and @used_by_function must be allocated for it. The store to
; @used_by_both is therefore expected at offset 8, i.e. sizeof(double), rather
; than at offset 0. The runs with -global-isel materialise the 8 in a VGPR
; instead of using an offset:8 operand.
77define amdgpu_kernel void @withcall() {
78; GFX9-LABEL: withcall:
79; GFX9:       ; %bb.0:
80; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
81; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
82; GFX9-NEXT:    s_mov_b32 s22, -1
83; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
84; GFX9-NEXT:    s_add_u32 s20, s20, s11
85; GFX9-NEXT:    s_addc_u32 s21, s21, 0
86; GFX9-NEXT:    s_mov_b32 s12, s8
87; GFX9-NEXT:    s_add_u32 s8, s4, 36
88; GFX9-NEXT:    s_mov_b32 s13, s9
89; GFX9-NEXT:    s_addc_u32 s9, s5, 0
90; GFX9-NEXT:    s_getpc_b64 s[4:5]
91; GFX9-NEXT:    s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4
92; GFX9-NEXT:    s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12
93; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
94; GFX9-NEXT:    s_mov_b32 s14, s10
95; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
96; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
97; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
98; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
99; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
100; GFX9-NEXT:    s_mov_b64 s[0:1], s[20:21]
101; GFX9-NEXT:    v_mov_b32_e32 v3, 0
102; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
103; GFX9-NEXT:    s_mov_b64 s[2:3], s[22:23]
104; GFX9-NEXT:    s_mov_b32 s32, 0
105; GFX9-NEXT:    ds_write_b32 v3, v3 offset:8
106; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
108; GFX9-NEXT:    s_endpgm
109;
110; GFX10-LABEL: withcall:
111; GFX10:       ; %bb.0:
112; GFX10-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
113; GFX10-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
114; GFX10-NEXT:    s_mov_b32 s22, -1
115; GFX10-NEXT:    s_mov_b32 s23, 0x31c16000
116; GFX10-NEXT:    s_add_u32 s20, s20, s11
117; GFX10-NEXT:    s_addc_u32 s21, s21, 0
118; GFX10-NEXT:    s_mov_b32 s12, s8
119; GFX10-NEXT:    s_add_u32 s8, s4, 36
120; GFX10-NEXT:    s_mov_b32 s13, s9
121; GFX10-NEXT:    s_addc_u32 s9, s5, 0
122; GFX10-NEXT:    s_getpc_b64 s[4:5]
123; GFX10-NEXT:    s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4
124; GFX10-NEXT:    s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12
125; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
126; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
127; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
128; GFX10-NEXT:    v_mov_b32_e32 v3, 0
129; GFX10-NEXT:    s_mov_b32 s14, s10
130; GFX10-NEXT:    s_mov_b64 s[10:11], s[6:7]
131; GFX10-NEXT:    s_mov_b64 s[4:5], s[0:1]
132; GFX10-NEXT:    v_or3_b32 v31, v0, v1, v2
133; GFX10-NEXT:    s_mov_b64 s[6:7], s[2:3]
134; GFX10-NEXT:    s_mov_b64 s[0:1], s[20:21]
135; GFX10-NEXT:    s_mov_b64 s[2:3], s[22:23]
136; GFX10-NEXT:    s_mov_b32 s32, 0
137; GFX10-NEXT:    ds_write_b32 v3, v3 offset:8
138; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
140; GFX10-NEXT:    s_endpgm
141;
142; G_GFX9-LABEL: withcall:
143; G_GFX9:       ; %bb.0:
144; G_GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
145; G_GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
146; G_GFX9-NEXT:    s_mov_b32 s22, -1
147; G_GFX9-NEXT:    s_mov_b32 s23, 0xe00000
148; G_GFX9-NEXT:    s_add_u32 s20, s20, s11
149; G_GFX9-NEXT:    s_addc_u32 s21, s21, 0
150; G_GFX9-NEXT:    s_mov_b32 s16, s8
151; G_GFX9-NEXT:    s_add_u32 s8, s4, 36
152; G_GFX9-NEXT:    s_mov_b32 s15, s9
153; G_GFX9-NEXT:    s_addc_u32 s9, s5, 0
154; G_GFX9-NEXT:    s_mov_b64 s[12:13], s[0:1]
155; G_GFX9-NEXT:    s_getpc_b64 s[0:1]
156; G_GFX9-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
157; G_GFX9-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
158; G_GFX9-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x0
159; G_GFX9-NEXT:    s_mov_b32 s14, s10
160; G_GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
161; G_GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
162; G_GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
163; G_GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
164; G_GFX9-NEXT:    s_mov_b64 s[0:1], s[20:21]
165; G_GFX9-NEXT:    v_mov_b32_e32 v3, 0
166; G_GFX9-NEXT:    v_mov_b32_e32 v4, 8
167; G_GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
168; G_GFX9-NEXT:    s_mov_b64 s[2:3], s[22:23]
169; G_GFX9-NEXT:    s_mov_b64 s[4:5], s[12:13]
170; G_GFX9-NEXT:    s_mov_b32 s12, s16
171; G_GFX9-NEXT:    s_mov_b32 s13, s15
172; G_GFX9-NEXT:    s_mov_b32 s32, 0
173; G_GFX9-NEXT:    ds_write_b32 v4, v3
174; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
175; G_GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
176; G_GFX9-NEXT:    s_endpgm
177;
178; G_GFX10-LABEL: withcall:
179; G_GFX10:       ; %bb.0:
180; G_GFX10-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
181; G_GFX10-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
182; G_GFX10-NEXT:    s_mov_b32 s22, -1
183; G_GFX10-NEXT:    s_mov_b32 s23, 0x31c16000
184; G_GFX10-NEXT:    s_add_u32 s20, s20, s11
185; G_GFX10-NEXT:    s_addc_u32 s21, s21, 0
186; G_GFX10-NEXT:    s_mov_b32 s16, s8
187; G_GFX10-NEXT:    s_add_u32 s8, s4, 36
188; G_GFX10-NEXT:    s_mov_b32 s15, s9
189; G_GFX10-NEXT:    s_addc_u32 s9, s5, 0
190; G_GFX10-NEXT:    s_mov_b64 s[12:13], s[0:1]
191; G_GFX10-NEXT:    s_getpc_b64 s[0:1]
192; G_GFX10-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
193; G_GFX10-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
194; G_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
195; G_GFX10-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x0
196; G_GFX10-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
197; G_GFX10-NEXT:    v_mov_b32_e32 v3, 0
198; G_GFX10-NEXT:    v_mov_b32_e32 v4, 8
199; G_GFX10-NEXT:    s_mov_b32 s14, s10
200; G_GFX10-NEXT:    s_mov_b64 s[10:11], s[6:7]
201; G_GFX10-NEXT:    v_or3_b32 v31, v0, v1, v2
202; G_GFX10-NEXT:    s_mov_b64 s[6:7], s[2:3]
203; G_GFX10-NEXT:    s_mov_b64 s[0:1], s[20:21]
204; G_GFX10-NEXT:    s_mov_b64 s[2:3], s[22:23]
205; G_GFX10-NEXT:    s_mov_b64 s[4:5], s[12:13]
206; G_GFX10-NEXT:    s_mov_b32 s12, s16
207; G_GFX10-NEXT:    s_mov_b32 s13, s15
208; G_GFX10-NEXT:    s_mov_b32 s32, 0
209; G_GFX10-NEXT:    ds_write_b32 v4, v3
210; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
211; G_GFX10-NEXT:    s_swappc_b64 s[30:31], s[18:19]
212; G_GFX10-NEXT:    s_endpgm
213  store i32 0, ptr addrspace(3) @used_by_both
214  call void @nonkernel()
215  ret void
216}
217; CHECK: ; LDSByteSize: 16 bytes
218
219; This kernel makes no calls, but the i32 it stores to is also used by the
220; unrelated non-kernel function @nonkernel. Previous lowering was less efficient
221; here than necessary; codegen is now the same as nocall_ideal: the store is at
; offset 0 and LDSByteSize is 4.
222define amdgpu_kernel void @nocall_false_sharing() {
223; CHECK-LABEL: nocall_false_sharing:
224; CHECK:       ; %bb.0:
225; CHECK-NEXT:    v_mov_b32_e32 v0, 0
226; CHECK-NEXT:    ds_write_b32 v0, v0
227; CHECK-NEXT:    s_endpgm
228  store i32 0, ptr addrspace(3) @used_by_both
229  ret void
230}
231; CHECK: ; LDSByteSize: 4 bytes
232
; Module flags: sets "amdhsa_code_object_version" to 500 for this test module.
233!llvm.module.flags = !{!0}
234!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
235