xref: /llvm-project/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl (revision 3a29dfe37c585355dc70c7c614f5bbf071cd7efb)
1// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
3
4// REQUIRES: amdgpu-registered-target
5
6typedef unsigned int uint;
7
8// CHECK-LABEL: @test_s_sleep_var(
9// CHECK-NEXT:  entry:
10// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
11// CHECK-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
12// CHECK-NEXT:    store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
13// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
14// CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[TMP0]])
15// CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 15)
16// CHECK-NEXT:    ret void
17//
18void test_s_sleep_var(int d)
19{
20  __builtin_amdgcn_s_sleep_var(d);
21  __builtin_amdgcn_s_sleep_var(15);
22}
23
24// CHECK-LABEL: @test_permlane16_var(
25// CHECK-NEXT:  entry:
26// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
27// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
28// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
29// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
30// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
31// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
32// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
33// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
34// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
35// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
36// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
37// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
38// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
39// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
40// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
41// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
42// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
43// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
44// CHECK-NEXT:    ret void
45//
46void test_permlane16_var(global uint* out, uint a, uint b, uint c) {
47  *out = __builtin_amdgcn_permlane16_var(a, b, c, 0, 0);
48}
49
50// CHECK-LABEL: @test_permlanex16_var(
51// CHECK-NEXT:  entry:
52// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
53// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
54// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
55// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
56// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
57// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
58// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
59// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
60// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
61// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
62// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
63// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
64// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
65// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
66// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
67// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlanex16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
68// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
69// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
70// CHECK-NEXT:    ret void
71//
72void test_permlanex16_var(global uint* out, uint a, uint b, uint c) {
73  *out = __builtin_amdgcn_permlanex16_var(a, b, c, 0, 0);
74}
75
76// CHECK-LABEL: @test_s_barrier_signal(
77// CHECK-NEXT:  entry:
78// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -1)
79// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
80// CHECK-NEXT:    ret void
81//
82void test_s_barrier_signal()
83{
84  __builtin_amdgcn_s_barrier_signal(-1);
85  __builtin_amdgcn_s_barrier_wait(-1);
86}
87
88// CHECK-LABEL: @test_s_barrier_signal_var(
89// CHECK-NEXT:  entry:
90// CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
91// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
92// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
93// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
94// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
95// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
96// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
97// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
98// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
99// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
100// CHECK-NEXT:    ret void
101//
102void test_s_barrier_signal_var(void *bar, int a)
103{
104  __builtin_amdgcn_s_barrier_signal_var(bar, a);
105}
106
107// CHECK-LABEL: @test_s_barrier_signal_isfirst(
108// CHECK-NEXT:  entry:
109// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
110// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
111// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
112// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
113// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
114// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
115// CHECK-NEXT:    store ptr [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
116// CHECK-NEXT:    store ptr [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
117// CHECK-NEXT:    store ptr [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
118// CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
119// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
120// CHECK:       if.then:
121// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
122// CHECK-NEXT:    store ptr [[TMP1]], ptr [[A_ADDR_ASCAST]], align 8
123// CHECK-NEXT:    br label [[IF_END:%.*]]
124// CHECK:       if.else:
125// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
126// CHECK-NEXT:    store ptr [[TMP2]], ptr [[A_ADDR_ASCAST]], align 8
127// CHECK-NEXT:    br label [[IF_END]]
128// CHECK:       if.end:
129// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
130// CHECK-NEXT:    ret void
131//
132void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
133{
134  if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
135    a = b;
136  else
137    a = c;
138
139  __builtin_amdgcn_s_barrier_wait(1);
140}
141
142// CHECK-LABEL: @test_s_barrier_init(
143// CHECK-NEXT:  entry:
144// CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
145// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
146// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
147// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
148// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
149// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
150// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
151// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
152// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
153// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
154// CHECK-NEXT:    ret void
155//
156void test_s_barrier_init(void *bar, int a)
157{
158  __builtin_amdgcn_s_barrier_init(bar, a);
159}
160
161// CHECK-LABEL: @test_s_barrier_join(
162// CHECK-NEXT:  entry:
163// CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
164// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
165// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
166// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
167// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
168// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
169// CHECK-NEXT:    ret void
170//
171void test_s_barrier_join(void *bar)
172{
173  __builtin_amdgcn_s_barrier_join(bar);
174}
175
176// CHECK-LABEL: @test_s_barrier_leave(
177// CHECK-NEXT:  entry:
178// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.leave(i16 1)
179// CHECK-NEXT:    ret void
180//
181void test_s_barrier_leave()
182{
183  __builtin_amdgcn_s_barrier_leave(1);
184}
185
186// CHECK-LABEL: @test_s_get_barrier_state(
187// CHECK-NEXT:  entry:
188// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
189// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
190// CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
191// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
192// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
193// CHECK-NEXT:    [[STATE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STATE]] to ptr
194// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
195// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
196// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
197// CHECK-NEXT:    store i32 [[TMP1]], ptr [[STATE_ASCAST]], align 4
198// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[STATE_ASCAST]], align 4
199// CHECK-NEXT:    ret i32 [[TMP2]]
200//
201unsigned test_s_get_barrier_state(int a)
202{
203  unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
204  return State;
205}
206
207// CHECK-LABEL: @test_s_get_named_barrier_state(
208// CHECK-NEXT:  entry:
209// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
210// CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
211// CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
212// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
213// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
214// CHECK-NEXT:    [[STATE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STATE]] to ptr
215// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
216// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
217// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
218// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
219// CHECK-NEXT:    store i32 [[TMP2]], ptr [[STATE_ASCAST]], align 4
220// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[STATE_ASCAST]], align 4
221// CHECK-NEXT:    ret i32 [[TMP3]]
222//
223unsigned test_s_get_named_barrier_state(void *bar)
224{
225  unsigned State = __builtin_amdgcn_s_get_named_barrier_state(bar);
226  return State;
227}
228
229// CHECK-LABEL: @test_s_ttracedata(
230// CHECK-NEXT:  entry:
231// CHECK-NEXT:    call void @llvm.amdgcn.s.ttracedata(i32 1)
232// CHECK-NEXT:    ret void
233//
234void test_s_ttracedata()
235{
236  __builtin_amdgcn_s_ttracedata(1);
237}
238
239// CHECK-LABEL: @test_s_ttracedata_imm(
240// CHECK-NEXT:  entry:
241// CHECK-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
242// CHECK-NEXT:    ret void
243//
244void test_s_ttracedata_imm()
245{
246  __builtin_amdgcn_s_ttracedata_imm(1);
247}
248
249// CHECK-LABEL: @test_s_prefetch_data(
250// CHECK-NEXT:  entry:
251// CHECK-NEXT:    [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
252// CHECK-NEXT:    [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
253// CHECK-NEXT:    [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5)
254// CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
255// CHECK-NEXT:    [[FP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FP_ADDR]] to ptr
256// CHECK-NEXT:    [[GP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GP_ADDR]] to ptr
257// CHECK-NEXT:    [[CP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CP_ADDR]] to ptr
258// CHECK-NEXT:    [[LEN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LEN_ADDR]] to ptr
259// CHECK-NEXT:    store ptr [[FP:%.*]], ptr [[FP_ADDR_ASCAST]], align 8
260// CHECK-NEXT:    store ptr addrspace(1) [[GP:%.*]], ptr [[GP_ADDR_ASCAST]], align 8
261// CHECK-NEXT:    store ptr addrspace(4) [[CP:%.*]], ptr [[CP_ADDR_ASCAST]], align 8
262// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr [[LEN_ADDR_ASCAST]], align 4
263// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[FP_ADDR_ASCAST]], align 8
264// CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0)
265// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GP_ADDR_ASCAST]], align 8
266// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[LEN_ADDR_ASCAST]], align 4
267// CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]])
268// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr [[CP_ADDR_ASCAST]], align 8
269// CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31)
270// CHECK-NEXT:    ret void
271//
272void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len)
273{
274  __builtin_amdgcn_s_prefetch_data(fp, 0);
275  __builtin_amdgcn_s_prefetch_data(gp, len);
276  __builtin_amdgcn_s_prefetch_data(cp, 31);
277}
278
279// CHECK-LABEL: @test_s_buffer_prefetch_data(
280// CHECK-NEXT:  entry:
281// CHECK-NEXT:    [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
282// CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
283// CHECK-NEXT:    [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
284// CHECK-NEXT:    [[LEN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LEN_ADDR]] to ptr
285// CHECK-NEXT:    store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
286// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr [[LEN_ADDR_ASCAST]], align 4
287// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
288// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[LEN_ADDR_ASCAST]], align 4
289// CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP0]], i32 128, i32 [[TMP1]])
290// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
291// CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP2]], i32 0, i32 31)
292// CHECK-NEXT:    ret void
293//
294void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
295{
296  __builtin_amdgcn_s_buffer_prefetch_data(rsrc, 128, len);
297  __builtin_amdgcn_s_buffer_prefetch_data(rsrc, 0, 31);
298}
299
300// CHECK-LABEL: @test_ds_bpermute_fi_b32(
301// CHECK-NEXT:  entry:
302// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
303// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
304// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
305// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
306// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
307// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
308// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
309// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
310// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
311// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
312// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
313// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 [[TMP0]], i32 [[TMP1]])
314// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
315// CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
316// CHECK-NEXT:    ret void
317//
318void test_ds_bpermute_fi_b32(global int* out, int a, int b)
319{
320  *out = __builtin_amdgcn_ds_bpermute_fi_b32(a, b);
321}
322