; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll (revision a82032918cd445e5750e171f57d4f3d7096c021a)
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; full_mask: llvm.amdgcn.init.exec called with the constant -1.
; In the IR the intrinsic call comes after the fadd; the expected output has
; the write of -1 to exec ahead of the vector add.
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

; partial_mask: llvm.amdgcn.init.exec called with a constant that is not
; all-ones. The operand 123456 is expected to show up as the hexadecimal
; immediate 0x1e240 in the move to exec, again ahead of the vector add.
; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

; input_s3off8: llvm.amdgcn.init.exec.from.input with the count taken from the
; fourth inreg argument (expected in s3) and a bit offset of 8.
; Expected sequence: a bitfield extract from s3 with immediate 0x70008 (low
; half matches the offset 8; the 7 in the high half is presumably the field
; width - confirm against the ISA docs), a bitfield-mask write to exec, then a
; compare of the extracted value with 64 and a conditional move of -1 to exec.
; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

; input_s0off19: same shape as input_s3off8, but the count is the first inreg
; argument (expected in s0) and the bit offset is 19, so the extract immediate
; is expected to be 0x70013 (0x13 == 19).
; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

; reuse_input: %count is still needed after the intrinsic call (it feeds the
; add). The expected output extracts into s1 rather than overwriting s0, and
; the later vector add still reads s0.
; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}

; reuse_input2: variant of reuse_input in which the add and the conversion
; using %count are placed before the intrinsic call in the IR. The expected
; output is the same: extract into s1, set up exec, then the vector add
; reading s0.
; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

; init_unreachable: the intrinsic call is immediately followed by
; 'unreachable'. Only the function label is matched; the test passes as long
; as compilation completes.
; GCN-LABEL: {{^}}init_unreachable:
;
; This used to crash.
define amdgpu_ps void @init_unreachable() {
main_body:
  call void @llvm.amdgcn.init.exec(i64 -1)
  unreachable
}

; init_exec_before_frame_materialize: two addrspace(5) allocas are declared
; ahead of the intrinsic call and then accessed with both constant and
; variable (%b) indices. The expected output sets exec to -1 first; the v_mov
; and v_add are matched only after that.
; NOTE(review): the negative pattern below is anchored at column 0. If
; instruction lines in the llc output are indented it can never match, which
; would make that check vacuous - confirm.
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_mov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  ; Stores at constant indices into both arrays.
  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  ; Loads at the variable index %b from both arrays.
  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; init_exec_input_before_frame_materialize: same alloca/store/load body as
; init_exec_before_frame_materialize, but exec is initialised with
; llvm.amdgcn.init.exec.from.input using the third inreg argument (expected in
; s2) and bit offset 8. The four scalar instructions of the exec setup are
; expected on consecutive output lines, with the v_mov and v_add matched only
; afterwards.
; NOTE(review): the negative pattern below is anchored at column 0. If
; instruction lines in the llc output are indented it can never match, which
; would make that check vacuous - confirm.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  ; Stores at constant indices into both arrays.
  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  ; Loads at the variable index %b from both arrays.
  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; init_exec_input_before_frame_materialize_nonentry: like
; init_exec_input_before_frame_materialize, but the intrinsic call sits in
; %endif, which is reached through a conditional branch rather than being the
; entry block. %count is also used again after the call (added into the
; result). The checks first find the text "%endif" in the output, then expect
; the extract from s2 into some SGPR followed on consecutive lines by the rest
; of the exec setup using that same register.
; NOTE(review): the negative pattern below is anchored at column 0. If
; instruction lines in the llc output are indented it can never match, which
; would make that check vacuous - confirm.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN-NOT: {{^}}v_
; GCN: %endif
; GCN: s_bfe_u32 [[S:s[0-9]+]], s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, [[S]], 0
; GCN-NEXT: s_cmp_eq_u32 [[S]], 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; ideally these allocas would be in %endif, but this causes problems on Windows GlobalISel
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  ; Empty side-effecting inline asm; it is the only content of this block.
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  ; Stores at constant indices into both arrays.
  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  ; Loads at the variable index %b from both arrays.
  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  ; Keeps %count in use after the intrinsic call.
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

; Declarations of the two intrinsics exercised above: one takes an i64
; operand, the other two i32 operands (passed as %count and a constant bit
; offset by the callers in this file). Both carry attribute group #1.
declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }
