xref: /llvm-project/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll (revision bc6955f18ced3ca89d49bc28eeb58cd6d367e136)
1; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
2; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
3; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
4; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
6; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
7; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
9
; Data layout shared by every function in this file. In LLVM's data layout
; syntax the pN entries give per-address-space pointer widths (64-bit for p1
; and p4, 32-bit for p3 and p5 here) and "A5" places allocas in address space 5.
10target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
11
12; OPT-LABEL: @test_sink_global_small_offset_i32(
13; OPT-CI-NOT: getelementptr i32, ptr addrspace(1) %in
14; OPT-VI: getelementptr i32, ptr addrspace(1) %in
15; OPT: br i1
16; OPT-CI: getelementptr i8,
17
18; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
; Kernel loading one i32 from global memory (addrspace 1) at %in + 7 elements.
; %in.gep is computed in %entry, but its only user is the load in %if. That
; block is bypassed when llvm.amdgcn.mbcnt.lo(-1, 0) returns 0, in which case
; the phi in %endif yields 0. The result is stored through %out.gep.
19define amdgpu_kernel void @test_sink_global_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
20entry:
21  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
22  %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 7
23  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
24  %tmp0 = icmp eq i32 %tid, 0
25  br i1 %tmp0, label %endif, label %if
26
27if:
28  %tmp1 = load i32, ptr addrspace(1) %in.gep
29  br label %endif
30
31endif:
32  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
33  store i32 %x, ptr addrspace(1) %out.gep
34  br label %done
35
36done:
37  ret void
38}
39
40; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
41; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
42; OPT: br i1
43
44; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
45; GCN: s_and_saveexec_b64
46; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
47
48; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0xf000{{$}}
49; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
50; GCN: {{^}}.LBB1_2:
51; GCN: s_or_b64 exec
; Byte load from global memory at %in + 65535 (0xffff), sign-extended to i32
; inside %if. The address is formed in %entry and used only in %if.
; 65535 = 0xf000 + 4095, which is the register/immediate split the gfx900
; checks above expect.
52define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
53entry:
54  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
55  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
56  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
57  %tmp0 = icmp eq i32 %tid, 0
58  br i1 %tmp0, label %endif, label %if
59
60if:
61  %tmp1 = load i8, ptr addrspace(1) %in.gep
62  %tmp2 = sext i8 %tmp1 to i32
63  br label %endif
64
65endif:
66  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
67  store i32 %x, ptr addrspace(1) %out.gep
68  br label %done
69
70done:
71  ret void
72}
73
74; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
75; GCN: s_and_saveexec_b64
76; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
77; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
78; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
79; GCN: {{^}}.LBB2_2:
80; GCN: s_or_b64 exec
; Byte load from global memory at %in + 4095, sign-extended to i32 in %if.
; 4095 is the value the checks above expect to appear directly as the
; instruction's immediate "offset:4095" on every tested subtarget.
; Unlike most kernels in this file, %out.gep uses an i32 index (1024).
81define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
82entry:
83  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
84  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4095
85  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
86  %tmp0 = icmp eq i32 %tid, 0
87  br i1 %tmp0, label %endif, label %if
88
89if:
90  %tmp1 = load i8, ptr addrspace(1) %in.gep
91  %tmp2 = sext i8 %tmp1 to i32
92  br label %endif
93
94endif:
95  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
96  store i32 %x, ptr addrspace(1) %out.gep
97  br label %done
98
99done:
100  ret void
101}
102
103; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
104; GCN: s_and_saveexec_b64
105; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
106; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x1000{{$}}
107; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}}
108; GCN: {{^}}.LBB3_2:
109; GCN: s_or_b64 exec
; Same shape as the 4095-byte case, but the byte offset is 4096 (0x1000), one
; past it. The checks above expect the offset to be carried in a register
; (an SGPR operand on the buffer load, a 0x1000 VGPR on gfx900) instead of an
; immediate offset field.
110define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
111entry:
112  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
113  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4096
114  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
115  %tmp0 = icmp eq i32 %tid, 0
116  br i1 %tmp0, label %endif, label %if
117
118if:
119  %tmp1 = load i8, ptr addrspace(1) %in.gep
120  %tmp2 = sext i8 %tmp1 to i32
121  br label %endif
122
123endif:
124  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
125  store i32 %x, ptr addrspace(1) %out.gep
126  br label %done
127
128done:
129  ret void
130}
131
132; OPT-LABEL: @test_sink_scratch_small_offset_i32(
133; OPT-NOT:  getelementptr [512 x i32]
134; OPT: br i1
135; OPT: getelementptr i8,
136
137; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
138; GCN: s_and_saveexec_b64
139; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088{{$}}
140; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088 glc{{$}}
141; GCN: {{^}}.LBB4_2:
; Private-memory (addrspace 5) case. %alloca.gep addresses element 1022 of the
; alloca, i.e. byte offset 1022 * 4 = 4088, the immediate the checks above
; expect. The index exceeds the 512 declared elements; the GEP is not marked
; inbounds. The address is used by a volatile store and load in %if and by a
; second volatile load in %endif.
; %add.arg is computed but has no users in this function.
142define amdgpu_kernel void @test_sink_scratch_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
143entry:
144  %alloca = alloca [512 x i32], align 4, addrspace(5)
145  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
146  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
147  %add.arg = add i32 %arg, 8
148  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1022
149  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
150  %tmp0 = icmp eq i32 %tid, 0
151  br i1 %tmp0, label %endif, label %if
152
153if:
154  store volatile i32 123, ptr addrspace(5) %alloca.gep
155  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
156  br label %endif
157
158endif:
159  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
160  store i32 %x, ptr addrspace(1) %out.gep.0
161  %load = load volatile i32, ptr addrspace(5) %alloca.gep
162  store i32 %load, ptr addrspace(1) %out.gep.1
163  br label %done
164
165done:
166  ret void
167}
168
169; This used to be a special case when the scavenge slot was
170; fixed at offset 0.
171; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
172; OPT-NOT:  getelementptr [512 x i32]
173; OPT: br i1
174; OPT: getelementptr i8,
175
176; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
177; GCN: s_and_saveexec_b64
178; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
179; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
180; GCN: {{^.LBB[0-9]+}}_2:
181
; Identical to test_sink_scratch_small_offset_i32 except that the element index
; is 1023, giving byte offset 1023 * 4 = 4092, the immediate the checks above
; expect on the scratch store and load.
; %add.arg is computed but has no users in this function.
182define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
183entry:
184  %alloca = alloca [512 x i32], align 4, addrspace(5)
185  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
186  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
187  %add.arg = add i32 %arg, 8
188  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1023
189  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
190  %tmp0 = icmp eq i32 %tid, 0
191  br i1 %tmp0, label %endif, label %if
192
193if:
194  store volatile i32 123, ptr addrspace(5) %alloca.gep
195  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
196  br label %endif
197
198endif:
199  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
200  store i32 %x, ptr addrspace(1) %out.gep.0
201  %load = load volatile i32, ptr addrspace(5) %alloca.gep
202  store i32 %load, ptr addrspace(1) %out.gep.1
203  br label %done
204
205done:
206  ret void
207}
208
209; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
210; OPT: %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
211; OPT: br i1
212; OPT-NOT: ptrtoint
213
214; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
215; GCN: s_and_saveexec_b64
216; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
217; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc{{$}}
218; GCN: {{^.LBB[0-9]+}}_2:
; Negative counterpart of the two scratch tests above it in this file: the
; element index is 1024 (byte offset 4096). The checks above expect the GEP to
; remain in %entry unchanged and the scratch accesses to take their address
; from a VGPR ("offen") with no immediate offset.
; %add.arg is computed but has no users in this function.
219define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
220entry:
221  %alloca = alloca [512 x i32], align 4, addrspace(5)
222  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
223  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
224  %add.arg = add i32 %arg, 8
225  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
226  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
227  %tmp0 = icmp eq i32 %tid, 0
228  br i1 %tmp0, label %endif, label %if
229
230if:
231  store volatile i32 123, ptr addrspace(5) %alloca.gep
232  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
233  br label %endif
234
235endif:
236  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
237  store i32 %x, ptr addrspace(1) %out.gep.0
238  %load = load volatile i32, ptr addrspace(5) %alloca.gep
239  store i32 %load, ptr addrspace(1) %out.gep.1
240  br label %done
241
242done:
243  ret void
244}
245
246; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
247; GCN: s_and_saveexec_b64
248; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
249; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
250; GCN: {{^.LBB[0-9]+}}_2:
; Variable-offset case: the i32 element index into %in is the kernel argument
; %offset zero-extended to i64, so there is no constant offset to fold.
; The address is formed in %entry and loaded from only in %if.
251define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %offset) {
252entry:
253  %offset.ext = zext i32 %offset to i64
254  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
255  %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 %offset.ext
256  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
257  %tmp0 = icmp eq i32 %tid, 0
258  br i1 %tmp0, label %endif, label %if
259
260if:
261  %tmp1 = load i32, ptr addrspace(1) %in.gep
262  br label %endif
263
264endif:
265  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
266  store i32 %x, ptr addrspace(1) %out.gep
267  br label %done
268
269done:
270  ret void
271}
272
273; OPT-LABEL: @test_sink_constant_small_offset_i32
274; OPT-NOT:  getelementptr i32, ptr addrspace(4)
275; OPT: br i1
276
277; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
278; GCN: s_and_saveexec_b64
279; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
280; GCN: s_or_b64 exec, exec
; Constant-address-space (addrspace 4) load of one i32 at %in + 7 elements.
; The tahiti check above expects the element index 7 to appear as the scalar
; load's immediate (0x7). The address is formed in %entry and used only in %if.
281define amdgpu_kernel void @test_sink_constant_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
282entry:
283  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
284  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 7
285  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
286  %tmp0 = icmp eq i32 %tid, 0
287  br i1 %tmp0, label %endif, label %if
288
289if:
290  %tmp1 = load i32, ptr addrspace(4) %in.gep
291  br label %endif
292
293endif:
294  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
295  store i32 %x, ptr addrspace(1) %out.gep
296  br label %done
297
298done:
299  ret void
300}
301
302; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
303; OPT-NOT:  getelementptr i32, ptr addrspace(4)
304; OPT: br i1
305
306; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
307; GCN: s_and_saveexec_b64
308; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
309; GCN: s_or_b64 exec, exec
; Constant-address-space load at element index 255 (0xff), the largest value
; that fits in 8 bits. The tahiti check above expects 0xff as the scalar
; load's immediate.
310define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
311entry:
312  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
313  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 255
314  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
315  %tmp0 = icmp eq i32 %tid, 0
316  br i1 %tmp0, label %endif, label %if
317
318if:
319  %tmp1 = load i32, ptr addrspace(4) %in.gep
320  br label %endif
321
322endif:
323  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
324  store i32 %x, ptr addrspace(1) %out.gep
325  br label %done
326
327done:
328  ret void
329}
330
331; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
332; OPT-SI:  getelementptr i32, ptr addrspace(4)
333; OPT-CI-NOT:  getelementptr i32, ptr addrspace(4)
334; OPT-VI-NOT:  getelementptr i32, ptr addrspace(4)
335; OPT: br i1
336
337; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
338; GCN: s_and_saveexec_b64
339; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
340
341; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
342; GCN: s_or_b64 exec, exec
; Constant-address-space load at element index 256, one more than fits in
; 8 bits. The byte offset is 256 * 4 = 0x400, which the tahiti checks above
; expect to be materialized in an SGPR and passed to the scalar load.
343define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
344entry:
345  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
346  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 256
347  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
348  %tmp0 = icmp eq i32 %tid, 0
349  br i1 %tmp0, label %endif, label %if
350
351if:
352  %tmp1 = load i32, ptr addrspace(4) %in.gep
353  br label %endif
354
355endif:
356  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
357  store i32 %x, ptr addrspace(1) %out.gep
358  br label %done
359
360done:
361  ret void
362}
363
364; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
365; OPT-SI: getelementptr i32, ptr addrspace(4)
366; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
367; OPT: br i1
368
369; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
370; GCN: s_and_saveexec_b64
371; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
372; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
373; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
374
375; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
376; VI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
377; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
378
379; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffffff{{$}}
380
381; GCN: s_or_b64 exec, exec
; Constant-address-space load at element index 4294967295 (2^32 - 1).
; The byte offset is 4 * (2^32 - 1) = 0x3_ffff_fffc, i.e. low word 0xfffffffc
; (-4) and high word 3, which are the s_add_u32 / s_addc_u32 operands the
; tahiti and tonga checks above expect. The bonaire check expects the element
; index itself (0xffffffff) as the load's immediate.
382define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
383entry:
384  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
385  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 4294967295
386  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
387  %tmp0 = icmp eq i32 %tid, 0
388  br i1 %tmp0, label %endif, label %if
389
390if:
391  %tmp1 = load i32, ptr addrspace(4) %in.gep
392  br label %endif
393
394endif:
395  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
396  store i32 %x, ptr addrspace(1) %out.gep
397  br label %done
398
399done:
400  ret void
401}
402
403; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
404; OPT: getelementptr i32, ptr addrspace(4)
405; OPT: br i1
406
407; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
408; GCN: s_and_saveexec_b64
409; GCN: s_add_u32
410; GCN: s_addc_u32
411; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
412; GCN: s_or_b64 exec, exec
; Constant-address-space load at element index 17179869181 (2^34 - 3), which
; does not fit in 32 bits. The checks above expect the GEP to stay in %entry
; and the address to be formed with a 64-bit scalar add.
; NOTE(review): the function name suggests "max 32-bit offset plus one", but
; the index is 2^34 - 3 rather than 2^32 -- confirm the constant is intended.
413define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
414entry:
415  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
416  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 17179869181
417  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
418  %tmp0 = icmp eq i32 %tid, 0
419  br i1 %tmp0, label %endif, label %if
420
421if:
422  %tmp1 = load i32, ptr addrspace(4) %in.gep
423  br label %endif
424
425endif:
426  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
427  store i32 %x, ptr addrspace(1) %out.gep
428  br label %done
429
430done:
431  ret void
432}
433
434; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
435; GCN: s_and_saveexec_b64
436; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
437; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
438
439; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
440; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
441
442; GCN: s_or_b64 exec, exec
; Constant-address-space load at element index 262143 (0x3ffff).
; The byte offset is 262143 * 4 = 0xffffc, the largest multiple of 4 that fits
; in 20 bits. The checks above expect 0xffffc (in an SGPR on tahiti, as an
; immediate on tonga) and the element index 0x3ffff on bonaire.
; This test has only llc checks; no opt output is checked for it.
443define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
444entry:
445  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
446  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262143
447  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
448  %tmp0 = icmp eq i32 %tid, 0
449  br i1 %tmp0, label %endif, label %if
450
451if:
452  %tmp1 = load i32, ptr addrspace(4) %in.gep
453  br label %endif
454
455endif:
456  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
457  store i32 %x, ptr addrspace(1) %out.gep
458  br label %done
459
460done:
461  ret void
462}
463
464; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
465; OPT-SI: getelementptr i32, ptr addrspace(4)
466; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
467; OPT-VI: getelementptr i32, ptr addrspace(4)
468; OPT: br i1
469
470; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
471; GCN: s_and_saveexec_b64
472; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
473; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
474
475; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}
476
477; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
478; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
479
480; GCN: s_or_b64 exec, exec
; Constant-address-space load at element index 262144 (0x40000).
; The byte offset is 262144 * 4 = 0x100000, the first value that no longer
; fits in 20 bits. The checks above expect it in an SGPR on tahiti and tonga,
; and the element index 0x40000 as an immediate on bonaire.
481define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
482entry:
483  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
484  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262144
485  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
486  %tmp0 = icmp eq i32 %tid, 0
487  br i1 %tmp0, label %endif, label %if
488
489if:
490  %tmp1 = load i32, ptr addrspace(4) %in.gep
491  br label %endif
492
493endif:
494  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
495  store i32 %x, ptr addrspace(1) %out.gep
496  br label %done
497
498done:
499  ret void
500}
501
; Two adjacent 3-float arrays; field 1 starts at byte offset 12.
502%struct.foo = type { [3 x float], [3 x float] }
503
; Local-memory (addrspace 3) case with no conditional around the loads.
; %x and %y address elements 0 and 2 of field 1 of %struct.foo, i.e. byte
; offsets 12 and 20 from %ptr, which are dword offsets 3 and 5 -- the
; offset0/offset1 values expected on the ds_read2_b32 below. Both GEPs live in
; %entry while the loads that use them are in %bb32.
; The register-pair pattern on the ds_read2_b32 line is two bracketed digit
; classes separated by ':'; it previously lacked the closing ']' of the first
; class and only matched because the malformed class also accepted ':' and '['.
504; OPT-LABEL: @sink_ds_address(
505; OPT: getelementptr inbounds i8,
506
507; GCN-LABEL: {{^}}sink_ds_address:
508; GCN: s_load_dword [[SREG1:s[0-9]+]],
509; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
510; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
511define amdgpu_kernel void @sink_ds_address(ptr addrspace(3) nocapture %ptr) nounwind {
512entry:
513  %x = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 0
514  %y = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 2
515  br label %bb32
516
517bb32:
518  %a = load float, ptr addrspace(3) %x, align 4
519  %b = load float, ptr addrspace(3) %y, align 4
520  %cmp = fcmp one float %a, %b
521  br i1 %cmp, label %bb34, label %bb33
522
523bb33:
524  unreachable
525
526bb34:
527  unreachable
528}
529
530; Address offset is not a multiple of 4. This is a valid mubuf offset,
531; but not smrd.
532
533; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
534; OPT: br i1 %tmp0,
535; OPT: if:
536; OPT: getelementptr i8, {{.*}} 4095
; Constant-address-space i32 load with "align 1" from byte offset 4095, which
; is not a multiple of 4. The checks above expect the address computation to
; appear inside %if as an i8 GEP with constant 4095.
; Only opt output is checked for this function; there are no llc checks.
537define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(ptr addrspace(1) %out, ptr addrspace(4) %in) {
538entry:
539  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
540  %in.gep = getelementptr i8, ptr addrspace(4) %in, i64 4095
541  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
542  %tmp0 = icmp eq i32 %tid, 0
543  br i1 %tmp0, label %endif, label %if
544
545if:
546  %tmp1 = load i32, ptr addrspace(4) %in.gep, align 1
547  br label %endif
548
549endif:
550  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
551  store i32 %x, ptr addrspace(1) %out.gep
552  br label %done
553
554done:
555  ret void
556}
557
558; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
559; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
560; OPT: %tmp1 = atomicrmw add ptr addrspace(3) %sunkaddr, i32 2 seq_cst
; Local-memory (addrspace 3) atomicrmw add whose pointer operand is %in + 7
; i32 elements (28 bytes). The checks above expect that address to be rebuilt
; as an i8 GEP named %sunkaddr feeding the atomicrmw.
; Both %out and %in are addrspace(3) pointers here, so GEP indices are i32.
561define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
562entry:
563  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
564  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
565  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
566  %tmp0 = icmp eq i32 %tid, 0
567  br i1 %tmp0, label %endif, label %if
568
569if:
570  %tmp1 = atomicrmw add ptr addrspace(3) %in.gep, i32 2 seq_cst
571  br label %endif
572
573endif:
574  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
575  store i32 %x, ptr addrspace(3) %out.gep
576  br label %done
577
578done:
579  ret void
580}
581
582; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
583; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
584; OPT: %tmp1.struct = cmpxchg ptr addrspace(3) %sunkaddr, i32 undef, i32 2 seq_cst monotonic
; Local-memory cmpxchg whose pointer operand is %in + 7 i32 elements
; (28 bytes). The checks above expect the pointer to be replaced by an i8 GEP
; named %sunkaddr. The compare value is undef and the new value is 2; field 0
; of the result struct (the loaded value) feeds the phi in %endif.
585define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
586entry:
587  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
588  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
589  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
590  %tmp0 = icmp eq i32 %tid, 0
591  br i1 %tmp0, label %endif, label %if
592
593if:
594  %tmp1.struct = cmpxchg ptr addrspace(3) %in.gep, i32 undef, i32 2 seq_cst monotonic
595  %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
596  br label %endif
597
598endif:
599  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
600  store i32 %x, ptr addrspace(3) %out.gep
601  br label %done
602
603done:
604  ret void
605}
606
607; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
608; OPT: %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
609; OPT: br i1
610; OPT: cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
; Negative cmpxchg case: %in.gep is passed as the compare VALUE (second
; operand) of the cmpxchg, not as the address; the pointer operand and the new
; value are both undef. Because %in.gep is data here rather than an address,
; the checks above expect the GEP to stay in %entry and the cmpxchg to be
; unchanged. The exchanged type is ptr addrspace(3), so %out.gep steps over
; pointer-sized elements and the phi's default is null.
611define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
612entry:
613  %out.gep = getelementptr ptr addrspace(3), ptr addrspace(3) %out, i32 999999
614  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
615  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
616  %tmp0 = icmp eq i32 %tid, 0
617  br i1 %tmp0, label %endif, label %if
618
619if:
620  %tmp1.struct = cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
621  %tmp1 = extractvalue { ptr addrspace(3), i1 } %tmp1.struct, 0
622  br label %endif
623
624endif:
625  %x = phi ptr addrspace(3) [ %tmp1, %if ], [ null, %entry ]
626  store ptr addrspace(3) %x, ptr addrspace(3) %out.gep
627  br label %done
628
629done:
630  ret void
631}
</full_update_placeholder_guard>
632
; Byte load from global memory at %in - 4096, sign-extended to i32 in %if.
; For the tahiti, bonaire and tonga opt runs the checks expect the GEP to stay
; in %entry, followed by the branch, followed by a load that still uses
; %in.gep. For the gfx900 run they expect the address to be re-created after
; the branch as %sunkaddr, and the llc output to fold the constant into the
; instruction as offset:-4096.
; The middle check of the first group previously used a prefix that no RUN
; line defines, so FileCheck silently skipped it; it now uses the shared
; tahiti/bonaire/tonga prefix like its neighbours.
633; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
634; OPT-SICIVI: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
635; OPT-SICIVI: br
636; OPT-SICIVI: %tmp1 = load i8, ptr addrspace(1) %in.gep
637
638; OPT-GFX9: br
639; OPT-GFX9: %sunkaddr = getelementptr i8, ptr addrspace(1) %in, i64 -4096
640; OPT-GFX9: load i8, ptr addrspace(1) %sunkaddr
641
642; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
643; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
644; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
645define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
646entry:
647  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
648  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
649  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
650  %tmp0 = icmp eq i32 %tid, 0
651  br i1 %tmp0, label %endif, label %if
652
653if:
654  %tmp1 = load i8, ptr addrspace(1) %in.gep
655  %tmp2 = sext i8 %tmp1 to i32
656  br label %endif
657
658endif:
659  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
660  store i32 %x, ptr addrspace(1) %out.gep
661  br label %done
662
663done:
664  ret void
665}
666
667; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
668; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
669; OPT: br
670; OPT: load i8, ptr addrspace(1) %in.gep
671
672; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
; Byte load from global memory at %in - 4097, one byte below the -4096 case.
; The checks above expect, on every opt run, that the GEP stays in %entry and
; the load keeps using %in.gep. The llc checks only look for the function label.
673define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
674entry:
675  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
676  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
677  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
678  %tmp0 = icmp eq i32 %tid, 0
679  br i1 %tmp0, label %endif, label %if
680
681if:
682  %tmp1 = load i8, ptr addrspace(1) %in.gep
683  %tmp2 = sext i8 %tmp1 to i32
684  br label %endif
685
686endif:
687  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
688  store i32 %x, ptr addrspace(1) %out.gep
689  br label %done
690
691done:
692  ret void
693}
694
695; OPT-LABEL: @test_sink_small_offset_ds_append(
696; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
697; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %sunkaddr, i1 false)
; Intrinsic case: the pointer argument of llvm.amdgcn.ds.append.p3 is
; %in + 7 i32 elements (28 bytes) in local memory. The checks above expect the
; call to receive an i8 GEP named %sunkaddr instead of %in.gep.
; Only opt output is checked for this function.
698define amdgpu_kernel void @test_sink_small_offset_ds_append(ptr addrspace(3) %out, ptr addrspace(3) %in) {
699entry:
700  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
701  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
702  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
703  %tmp0 = icmp eq i32 %tid, 0
704  br i1 %tmp0, label %endif, label %if
705
706if:
707  %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %in.gep, i1 false)
708  br label %endif
709
710endif:
711  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
712  store i32 %x, ptr addrspace(3) %out.gep
713  br label %done
714
715done:
716  ret void
717}
718
719; OPT-LABEL: @test_sink_small_offset_ds_consume(
720; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
721; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %sunkaddr, i1 false)
; Same structure as test_sink_small_offset_ds_append, but the call in %if is
; llvm.amdgcn.ds.consume.p3. The checks above expect its pointer argument to
; become an i8 GEP named %sunkaddr at byte offset 28.
; Only opt output is checked for this function.
722define amdgpu_kernel void @test_sink_small_offset_ds_consume(ptr addrspace(3) %out, ptr addrspace(3) %in) {
723entry:
724  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
725  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
726  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
727  %tmp0 = icmp eq i32 %tid, 0
728  br i1 %tmp0, label %endif, label %if
729
730if:
731  %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %in.gep, i1 false)
732  br label %endif
733
734endif:
735  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
736  store i32 %x, ptr addrspace(3) %out.gep
737  br label %done
738
739done:
740  ret void
741}
742
; Intrinsics called by the kernels in this file. mbcnt.lo supplies the value
; that most kernels compare against 0 to form their conditional branch;
; ds.append / ds.consume are the memory intrinsics used by the last two tests.
743declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
744declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #3
745declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #3
746
; Attribute groups. #0 is attached to mbcnt.lo and its call sites, #3 to the
; ds.append / ds.consume declarations. #1 and #2 are not referenced anywhere
; in the visible part of this file.
747attributes #0 = { nounwind readnone }
748attributes #1 = { nounwind }
749attributes #2 = { nounwind argmemonly }
750attributes #3 = { argmemonly convergent nounwind willreturn }
751