xref: /llvm-project/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll (revision c93e001ca695e905cb965b36d63f7a348d1dd809)
1; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
2; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
3
4; Does not apply to wave64
5; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
6; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
7
8; Does not apply to gfx1101
9; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
10; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
11
12; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
13; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
14
15; Does not apply to gfx1103
16; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1103 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
17; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1103 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
18
19; There aren't any stack objects and the private segment stays disabled,
20; but with the workaround the user SGPR count is padded to 15, so the
21; workgroup ID is in s15.
22
23; 15 padding user SGPRs + workgroup_id_x = 16
24
; Smallest case: the kernel's only input is workgroup ID X, which it writes
; out with a volatile store. Under the WORKAROUND prefix the checks expect the
; ID in s15 and a user SGPR count of 15; under the NOWORKAROUND prefix they
; expect the ID in s0 and a user SGPR count of 0. Every user SGPR feature and
; the private segment are expected to be disabled for both prefixes.
25; GCN-LABEL: {{^}}minimal_kernel_inputs:
26; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
27; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s0
28; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
29
30; GCN: .amdhsa_kernel minimal_kernel_inputs
31; WORKAROUND: .amdhsa_user_sgpr_count 15
32; NOWORKAROUND: .amdhsa_user_sgpr_count 0
33; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
34; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
35; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
36; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
37; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
38; GCN-NEXT: .amdhsa_wavefront_size32
39; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
40; GCN-NEXT: .amdhsa_enable_private_segment 0
41; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
42; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
43; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
44; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
45; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
46; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
47; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0
48define amdgpu_kernel void @minimal_kernel_inputs() #0 {
49  %id = call i32 @llvm.amdgcn.workgroup.id.x()
; The store is what makes the register holding the workgroup ID visible in the
; checked v_mov_b32 / global_store_b32 pair.
50  store volatile i32 %id, ptr addrspace(1) undef
51  ret void
52}
53
; Same as the minimal case, but the kernel also has a stack object that is
; written with a volatile store, so the checks expect the private segment to be
; enabled. The expected workgroup ID register and user SGPR count are the same
; as in the minimal case (s15 and 15 under the WORKAROUND prefix, s0 and 0
; under the NOWORKAROUND prefix).
54; GCN-LABEL: {{^}}minimal_kernel_inputs_with_stack:
55; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
56; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s0
57; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
58
59; GCN: .amdhsa_kernel minimal_kernel_inputs_with_stack
60; WORKAROUND: .amdhsa_user_sgpr_count 15
61; NOWORKAROUND: .amdhsa_user_sgpr_count 0
62; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
63; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
64; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
65; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
66; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
67; GCN-NEXT: .amdhsa_wavefront_size32
68; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
69; GCN-NEXT: .amdhsa_enable_private_segment 1
70; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
71; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
72; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
73; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
74; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
75; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
76; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0
77define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 {
78  %alloca = alloca i32, addrspace(5)
79  %id = call i32 @llvm.amdgcn.workgroup.id.x()
80  store volatile i32 %id, ptr addrspace(1) undef
; Volatile store to the alloca so that the stack object is actually used.
81  store volatile i32 0, ptr addrspace(5) %alloca
82  ret void
83}
84
; Kernel that loads one byte through the queue pointer and then stores
; workgroup ID X. The checks expect the queue_ptr and kernarg_segment_ptr user
; SGPRs to be enabled. Under the NOWORKAROUND prefix that gives a user SGPR
; count of 4 with the workgroup ID in s4; under the WORKAROUND prefix the count
; is 15 with the workgroup ID in s15.
; NOTE(review): the kernel body never reads the kernarg segment; the enabled
; kernarg_segment_ptr is presumably a consequence of attribute group #1 not
; carrying "amdgpu-no-implicitarg-ptr" - confirm against the backend.
85; GCN-LABEL: {{^}}queue_ptr:
86; GCN: global_load_u8 v{{[0-9]+}},
87
88; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
89; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s4
90; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
91
92; GCN: .amdhsa_kernel queue_ptr
93; WORKAROUND: .amdhsa_user_sgpr_count 15
94; NOWORKAROUND: .amdhsa_user_sgpr_count 4
95; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
96; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
97; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
98; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
99; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
100; GCN-NEXT: .amdhsa_wavefront_size32
101; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
102; GCN-NEXT: .amdhsa_enable_private_segment 0
103; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
104; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
105; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
106; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
107; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
108; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
109; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 4
110define amdgpu_kernel void @queue_ptr() #1 {
; The volatile byte load keeps the queue pointer in use; it is matched by the
; global_load_u8 check above.
111  %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
112  %load = load volatile i8, ptr addrspace(4) %queue.ptr
113  %id = call i32 @llvm.amdgcn.workgroup.id.x()
114  store volatile i32 %id, ptr addrspace(1) undef
115  ret void
116}
117
; Kernel that uses a stack object, the dispatch, queue and implicitarg
; pointers, all three workgroup IDs and the dispatch ID. The checks expect
; s[0:1] and s[4:5] as bases of two of the byte loads and s6/s7 as the two
; halves of the 64-bit dispatch ID that is stored. Under the NOWORKAROUND
; prefix the user SGPR count is 8 and the workgroup IDs are in s8, s9 and s10;
; under the WORKAROUND prefix the count is 13 and the IDs are in s13, s14 and
; s15.
118; GCN-LABEL: {{^}}all_inputs:
119; WORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s13
120; WORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14
121; WORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15
122
123; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s8
124; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9
125; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10
126
127; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
128; GCN: global_load_u8 v{{[0-9]+}},
129; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
130
131; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6
132; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7
133
134; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_X]], off
135; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Y]], off
136; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Z]], off
137; GCN: global_store_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[DISPATCH_LO]]:[[DISPATCH_HI]]{{\]}}, off
138
139; GCN: .amdhsa_kernel all_inputs
140; WORKAROUND: .amdhsa_user_sgpr_count 13
141; NOWORKAROUND: .amdhsa_user_sgpr_count 8
142; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
143; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
144; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
145; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
146; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
147; GCN-NEXT: .amdhsa_wavefront_size32
148; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
149; GCN-NEXT: .amdhsa_enable_private_segment 1
150; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
151; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
152; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
153; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
154; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
155; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13
156; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 8
157define amdgpu_kernel void @all_inputs() #2 {
; Stack object plus a volatile store to it; the checks expect the private
; segment to be enabled.
158  %alloca = alloca i32, addrspace(5)
159  store volatile i32 0, ptr addrspace(5) %alloca
160
; One volatile byte load through each of the three pointer inputs.
161  %dispatch.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
162  %load.dispatch = load volatile i8, ptr addrspace(4) %dispatch.ptr
163
164  %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr()
165  %load.queue = load volatile i8, ptr addrspace(4) %queue.ptr
166
167  %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
168  %load.implicitarg = load volatile i8, ptr addrspace(4) %implicitarg.ptr
169
; Each workgroup ID is stored separately, in X, Y, Z order, matching the three
; global_store_b32 checks.
170  %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
171  store volatile i32 %id.x, ptr addrspace(1) undef
172
173  %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
174  store volatile i32 %id.y, ptr addrspace(1) undef
175
176  %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
177  store volatile i32 %id.z, ptr addrspace(1) undef
178
; The 64-bit dispatch ID is stored last, matching the global_store_b64 check.
179  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
180  store volatile i64 %dispatch.id, ptr addrspace(1) undef
181
182  ret void
183}
184
; Declarations of the AMDGPU intrinsics used by this test. All of them share
; attribute group #3.
; NOTE(review): llvm.amdgcn.kernarg.segment.ptr is declared here, but no call
; to it appears in the kernels visible in this file.
185declare i32 @llvm.amdgcn.workgroup.id.x() #3
186declare i32 @llvm.amdgcn.workgroup.id.y() #3
187declare i32 @llvm.amdgcn.workgroup.id.z() #3
188declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
189declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #3
190declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #3
191declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3
192declare i64 @llvm.amdgcn.dispatch.id() #3
193
; Attribute groups.
;   #0 - used by minimal_kernel_inputs and minimal_kernel_inputs_with_stack
;        (and on the queue.ptr call site inside queue_ptr); carries the full
;        list of "amdgpu-no-*" attributes.
;   #1 - used by queue_ptr; same as #0 without "amdgpu-no-implicitarg-ptr" and
;        "amdgpu-no-queue-ptr".
;   #2 - used by all_inputs; additionally drops the dispatch-id, dispatch-ptr,
;        workgroup-id-y and workgroup-id-z entries.
;   #3 - used by the intrinsic declarations.
194attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
195attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
196attributes #2 = { "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
197attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
198
; Module flag that sets "amdhsa_code_object_version" to 500 for every kernel
; in this file.
; NOTE(review): presumably selects AMDHSA code object v5 - confirm against the
; AMDGPU backend documentation.
199!llvm.module.flags = !{!0}
200!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
201