xref: /llvm-project/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll (revision 29441e4f5fa5f5c7709f7cf180815ba97f611297)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
3; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
4
; Aggregate of four i32 fields; used as the byval argument type of
; @non_kernel_function.
5%struct.uint4 = type { i32, i32, i32, i32 }
6
; 16-byte-aligned global in addrspace(1). The initializer words are
; 0x03020100, 0x07060504, 0x0B0A0908 and 0x0F0E0D0C. @non_kernel_function
; uses its generic-address cast as the alternative operand of a select.
7@gi = dso_local addrspace(1) externally_initialized global %struct.uint4 { i32 50462976, i32 117835012, i32 185207048, i32 252579084 }, align 16
8
9; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
10; Regular functions must still make a copy. `cvta.param` does not always work there.
; Non-kernel case (no ptx_kernel calling convention): %a is a byval
; %struct.uint4, %b selects between %a and the generic-address cast of @gi,
; and %c is a byte offset used for an align-1 i32 load from the selected
; pointer.
; The OPT checks expect the byval argument to be copied into an alloca via a
; memcpy from the addrspace(101) view of %a, with the select using that copy.
; The PTX checks expect a 16-byte local depot and the i32 load split into
; four byte loads that are recombined with shifts and ors.
; This function is not named by any "grid_constant" annotation visible at the
; end of this file.
11define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly byval(%struct.uint4) align 16 %a, i1 noundef zeroext %b, i32 noundef %c) local_unnamed_addr #0 {
12; OPT-LABEL: define dso_local noundef i32 @non_kernel_function(
13; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
14; OPT-NEXT:  [[ENTRY:.*:]]
15; OPT-NEXT:    [[A1:%.*]] = alloca [[STRUCT_UINT4]], align 16
16; OPT-NEXT:    [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(101)
17; OPT-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 16 [[A1]], ptr addrspace(101) align 16 [[A2]], i64 16, i1 false)
18; OPT-NEXT:    [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
19; OPT-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[C]] to i64
20; OPT-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]]
21; OPT-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1
22; OPT-NEXT:    ret i32 [[TMP0]]
23;
24; PTX-LABEL: non_kernel_function(
25; PTX:       {
26; PTX-NEXT:    .local .align 16 .b8 __local_depot0[16];
27; PTX-NEXT:    .reg .b64 %SP;
28; PTX-NEXT:    .reg .b64 %SPL;
29; PTX-NEXT:    .reg .pred %p<2>;
30; PTX-NEXT:    .reg .b16 %rs<3>;
31; PTX-NEXT:    .reg .b32 %r<11>;
32; PTX-NEXT:    .reg .b64 %rd<9>;
33; PTX-EMPTY:
34; PTX-NEXT:  // %bb.0: // %entry
35; PTX-NEXT:    mov.u64 %SPL, __local_depot0;
36; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
37; PTX-NEXT:    ld.param.u8 %rs1, [non_kernel_function_param_1];
38; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
39; PTX-NEXT:    setp.eq.b16 %p1, %rs2, 1;
40; PTX-NEXT:    ld.param.s32 %rd1, [non_kernel_function_param_2];
41; PTX-NEXT:    ld.param.u64 %rd2, [non_kernel_function_param_0+8];
42; PTX-NEXT:    st.u64 [%SP+8], %rd2;
43; PTX-NEXT:    ld.param.u64 %rd3, [non_kernel_function_param_0];
44; PTX-NEXT:    st.u64 [%SP], %rd3;
45; PTX-NEXT:    mov.u64 %rd4, gi;
46; PTX-NEXT:    cvta.global.u64 %rd5, %rd4;
47; PTX-NEXT:    add.u64 %rd6, %SP, 0;
48; PTX-NEXT:    selp.b64 %rd7, %rd6, %rd5, %p1;
49; PTX-NEXT:    add.s64 %rd8, %rd7, %rd1;
50; PTX-NEXT:    ld.u8 %r1, [%rd8];
51; PTX-NEXT:    ld.u8 %r2, [%rd8+1];
52; PTX-NEXT:    shl.b32 %r3, %r2, 8;
53; PTX-NEXT:    or.b32 %r4, %r3, %r1;
54; PTX-NEXT:    ld.u8 %r5, [%rd8+2];
55; PTX-NEXT:    shl.b32 %r6, %r5, 16;
56; PTX-NEXT:    ld.u8 %r7, [%rd8+3];
57; PTX-NEXT:    shl.b32 %r8, %r7, 24;
58; PTX-NEXT:    or.b32 %r9, %r8, %r6;
59; PTX-NEXT:    or.b32 %r10, %r9, %r4;
60; PTX-NEXT:    st.param.b32 [func_retval0], %r10;
61; PTX-NEXT:    ret;
62entry:
63  %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr), !dbg !17
64  %idx.ext = sext i32 %c to i64, !dbg !18
65  %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext, !dbg !18
66  %0 = load i32, ptr %add.ptr, align 1, !dbg !19
67  ret i32 %0, !dbg !23
68}
69
; Kernel with a byval(i32) parameter %input1 that is only loaded from.
; The body loads %input1, adds %input2 and stores the sum through %out;
; %n is not used by the body.
; The OPT checks expect the load to be rewritten to read straight from the
; addrspace(101) view of %input1 (no alloca, no copy) and %out to be routed
; through an addrspace(1) cast and back. The PTX checks expect plain
; ld.param.u32 loads and a st.global.u32 store.
70define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
71; PTX-LABEL: grid_const_int(
72; PTX:       {
73; PTX-NEXT:    .reg .b32 %r<4>;
74; PTX-NEXT:    .reg .b64 %rd<3>;
75; PTX-EMPTY:
76; PTX-NEXT:  // %bb.0:
77; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_int_param_2];
78; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
79; PTX-NEXT:    ld.param.u32 %r1, [grid_const_int_param_1];
80; PTX-NEXT:    ld.param.u32 %r2, [grid_const_int_param_0];
81; PTX-NEXT:    add.s32 %r3, %r2, %r1;
82; PTX-NEXT:    st.global.u32 [%rd2], %r3;
83; PTX-NEXT:    ret;
84; OPT-LABEL: define ptx_kernel void @grid_const_int(
85; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
86; OPT-NEXT:    [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
87; OPT-NEXT:    [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
88; OPT-NEXT:    [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
89; OPT-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
90; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
91; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT3]], align 4
92; OPT-NEXT:    ret void
93  %tmp = load i32, ptr %input1, align 4
94  %add = add i32 %tmp, %input2
95  store i32 %add, ptr %out
96  ret void
97}
98
; Two-field i32 aggregate used as the byval parameter type by the struct-based
; kernels that follow.
99%struct.s = type { i32, i32 }
100
; Kernel that reads both fields of a byval %struct.s through GEPs and stores
; their sum through %out; the parameter pointer itself is never passed on.
; The OPT checks expect both GEPs and both loads to be rewritten onto the
; addrspace(101) view of %input. The PTX checks expect ld.param.u32 at
; offsets 0 and 4 of parameter 0 and no local storage.
101define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
102; PTX-LABEL: grid_const_struct(
103; PTX:       {
104; PTX-NEXT:    .reg .b32 %r<4>;
105; PTX-NEXT:    .reg .b64 %rd<3>;
106; PTX-EMPTY:
107; PTX-NEXT:  // %bb.0:
108; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_struct_param_1];
109; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
110; PTX-NEXT:    ld.param.u32 %r1, [grid_const_struct_param_0];
111; PTX-NEXT:    ld.param.u32 %r2, [grid_const_struct_param_0+4];
112; PTX-NEXT:    add.s32 %r3, %r1, %r2;
113; PTX-NEXT:    st.global.u32 [%rd2], %r3;
114; PTX-NEXT:    ret;
115; OPT-LABEL: define ptx_kernel void @grid_const_struct(
116; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
117; OPT-NEXT:    [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
118; OPT-NEXT:    [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
119; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
120; OPT-NEXT:    [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
121; OPT-NEXT:    [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
122; OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
123; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
124; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
125; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT5]], align 4
126; OPT-NEXT:    ret void
127  %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
128  %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
129  %int1 = load i32, ptr %gep1
130  %int2 = load i32, ptr %gep2
131  %add = add i32 %int1, %int2
132  store i32 %add, ptr %out
133  ret void
134}
135
; Kernel whose byval parameter pointer escapes as a call argument to @escape.
; The OPT checks expect %input to be cast to addrspace(101), converted to a
; generic pointer with llvm.nvvm.ptr.param.to.gen, and that generic pointer
; to be passed to the call (no alloca). The PTX checks expect a cvta.param
; on the parameter symbol's address and no local depot.
136define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
137; PTX-LABEL: grid_const_escape(
138; PTX:       {
139; PTX-NEXT:    .reg .b32 %r<3>;
140; PTX-NEXT:    .reg .b64 %rd<5>;
141; PTX-EMPTY:
142; PTX-NEXT:  // %bb.0:
143; PTX-NEXT:    mov.b64 %rd2, grid_const_escape_param_0;
144; PTX-NEXT:    mov.u64 %rd3, %rd2;
145; PTX-NEXT:    cvta.param.u64 %rd4, %rd3;
146; PTX-NEXT:    mov.u64 %rd1, escape;
147; PTX-NEXT:    { // callseq 0, 0
148; PTX-NEXT:    .param .b64 param0;
149; PTX-NEXT:    st.param.b64 [param0], %rd4;
150; PTX-NEXT:    .param .b32 retval0;
151; PTX-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
152; PTX-NEXT:    call (retval0),
153; PTX-NEXT:    %rd1,
154; PTX-NEXT:    (
155; PTX-NEXT:    param0
156; PTX-NEXT:    )
157; PTX-NEXT:    , prototype_0;
158; PTX-NEXT:    ld.param.b32 %r1, [retval0];
159; PTX-NEXT:    } // callseq 0
160; PTX-NEXT:    ret;
161; OPT-LABEL: define ptx_kernel void @grid_const_escape(
162; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
163; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
164; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
165; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
166; OPT-NEXT:    ret void
167  %call = call i32 @escape(ptr %input)
168  ret void
169}
170
; Kernel with two byval parameters (%input and %b) plus a plain i32 %a that
; is spilled to an alloca; all three pointers are passed to @escape3.
; The OPT checks expect %input and %b each to become a generic pointer via
; llvm.nvvm.ptr.param.to.gen, while %a.addr stays an ordinary alloca.
; The PTX checks expect a 4-byte local depot (the slot for %a) and two
; cvta.param conversions, one per byval parameter.
171define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
172; PTX-LABEL: multiple_grid_const_escape(
173; PTX:       {
174; PTX-NEXT:    .local .align 4 .b8 __local_depot4[4];
175; PTX-NEXT:    .reg .b64 %SP;
176; PTX-NEXT:    .reg .b64 %SPL;
177; PTX-NEXT:    .reg .b32 %r<4>;
178; PTX-NEXT:    .reg .b64 %rd<10>;
179; PTX-EMPTY:
180; PTX-NEXT:  // %bb.0:
181; PTX-NEXT:    mov.u64 %SPL, __local_depot4;
182; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
183; PTX-NEXT:    mov.b64 %rd2, multiple_grid_const_escape_param_0;
184; PTX-NEXT:    mov.b64 %rd3, multiple_grid_const_escape_param_2;
185; PTX-NEXT:    mov.u64 %rd4, %rd3;
186; PTX-NEXT:    ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
187; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
188; PTX-NEXT:    mov.u64 %rd6, %rd2;
189; PTX-NEXT:    cvta.param.u64 %rd7, %rd6;
190; PTX-NEXT:    add.u64 %rd8, %SP, 0;
191; PTX-NEXT:    add.u64 %rd9, %SPL, 0;
192; PTX-NEXT:    st.local.u32 [%rd9], %r1;
193; PTX-NEXT:    mov.u64 %rd1, escape3;
194; PTX-NEXT:    { // callseq 1, 0
195; PTX-NEXT:    .param .b64 param0;
196; PTX-NEXT:    st.param.b64 [param0], %rd7;
197; PTX-NEXT:    .param .b64 param1;
198; PTX-NEXT:    st.param.b64 [param1], %rd8;
199; PTX-NEXT:    .param .b64 param2;
200; PTX-NEXT:    st.param.b64 [param2], %rd5;
201; PTX-NEXT:    .param .b32 retval0;
202; PTX-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
203; PTX-NEXT:    call (retval0),
204; PTX-NEXT:    %rd1,
205; PTX-NEXT:    (
206; PTX-NEXT:    param0,
207; PTX-NEXT:    param1,
208; PTX-NEXT:    param2
209; PTX-NEXT:    )
210; PTX-NEXT:    , prototype_1;
211; PTX-NEXT:    ld.param.b32 %r2, [retval0];
212; PTX-NEXT:    } // callseq 1
213; PTX-NEXT:    ret;
214; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
215; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
216; OPT-NEXT:    [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
217; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
218; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
219; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
220; OPT-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
221; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
222; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
223; OPT-NEXT:    ret void
224  %a.addr = alloca i32, align 4
225  store i32 %a, ptr %a.addr, align 4
226  %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
227  ret void
228}
229
; Kernel whose byval parameter pointer escapes by being stored to memory:
; the address %input itself is written through %addr.
; The OPT checks expect the stored value to be the generic pointer produced
; by llvm.nvvm.ptr.param.to.gen. The PTX checks expect a cvta.param followed
; by a st.global.u64 of the converted address.
230define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
231; PTX-LABEL: grid_const_memory_escape(
232; PTX:       {
233; PTX-NEXT:    .reg .b64 %rd<6>;
234; PTX-EMPTY:
235; PTX-NEXT:  // %bb.0:
236; PTX-NEXT:    mov.b64 %rd1, grid_const_memory_escape_param_0;
237; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
238; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
239; PTX-NEXT:    mov.u64 %rd4, %rd1;
240; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
241; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
242; PTX-NEXT:    ret;
243; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
244; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
245; OPT-NEXT:    [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
246; OPT-NEXT:    [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
247; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
248; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
249; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR5]], align 8
250; OPT-NEXT:    ret void
251  store ptr %input, ptr %addr, align 8
252  ret void
253}
254
; Kernel whose byval parameter pointer escapes into inline assembly: the
; addresses of both %struct.s fields are passed as operands to an
; "add.s64" asm statement and the i64 result is stored through %result.
; The OPT checks expect both GEPs to be based on the generic pointer produced
; by llvm.nvvm.ptr.param.to.gen. The PTX checks expect a cvta.param and no
; local depot.
; The OPT check for the asm call stops at "$2", so it matches only the start
; of that line -- presumably the check generator cut it at the ';' inside
; the asm string; confirm before hand-editing it.
; The trailing PTX-NOT directive needs its colon: without it FileCheck treats
; the line as an ordinary comment and never enforces it.
255define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
256; PTX-LABEL: grid_const_inlineasm_escape(
257; PTX:       {
258; PTX-NEXT:    .reg .b64 %rd<8>;
259; PTX-EMPTY:
260; PTX-NEXT:  // %bb.0:
261; PTX-NEXT:    mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
262; PTX-NEXT:    ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
263; PTX-NEXT:    cvta.to.global.u64 %rd6, %rd5;
264; PTX-NEXT:    mov.u64 %rd7, %rd4;
265; PTX-NEXT:    cvta.param.u64 %rd2, %rd7;
266; PTX-NEXT:    add.s64 %rd3, %rd2, 4;
267; PTX-NEXT:    // begin inline asm
268; PTX-NEXT:    add.s64 %rd1, %rd2, %rd3;
269; PTX-NEXT:    // end inline asm
270; PTX-NEXT:    st.global.u64 [%rd6], %rd1;
271; PTX-NEXT:    ret;
272; PTX-NOT:     .local
273; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
274; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
275; OPT-NEXT:    [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
276; OPT-NEXT:    [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
277; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
278; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
279; OPT-NEXT:    [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
280; OPT-NEXT:    [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
281; OPT-NEXT:    [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
282; OPT-NEXT:    store i64 [[TMP2]], ptr [[RESULT5]], align 8
283; OPT-NEXT:    ret void
284  %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
285  %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
286  %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
287  store i64 %1, ptr %result, align 8
288  ret void
289}
290
; Kernel that both reads its byval(i32) parameter and lets the pointer escape:
; it loads %input, stores twice the value through %output, then passes
; %input to @escape.
; The OPT checks expect a single generic pointer from
; llvm.nvvm.ptr.param.to.gen to feed both the load and the call, so the load
; is a generic load rather than an addrspace(101) load. The PTX checks expect
; one cvta.param, a generic ld.u32 through it, and no local depot.
291define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
292; PTX-LABEL: grid_const_partial_escape(
293; PTX:       {
294; PTX-NEXT:    .reg .b32 %r<5>;
295; PTX-NEXT:    .reg .b64 %rd<7>;
296; PTX-EMPTY:
297; PTX-NEXT:  // %bb.0:
298; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escape_param_0;
299; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escape_param_1];
300; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
301; PTX-NEXT:    mov.u64 %rd5, %rd2;
302; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
303; PTX-NEXT:    ld.u32 %r1, [%rd6];
304; PTX-NEXT:    add.s32 %r2, %r1, %r1;
305; PTX-NEXT:    st.global.u32 [%rd4], %r2;
306; PTX-NEXT:    mov.u64 %rd1, escape;
307; PTX-NEXT:    { // callseq 2, 0
308; PTX-NEXT:    .param .b64 param0;
309; PTX-NEXT:    st.param.b64 [param0], %rd6;
310; PTX-NEXT:    .param .b32 retval0;
311; PTX-NEXT:    prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
312; PTX-NEXT:    call (retval0),
313; PTX-NEXT:    %rd1,
314; PTX-NEXT:    (
315; PTX-NEXT:    param0
316; PTX-NEXT:    )
317; PTX-NEXT:    , prototype_2;
318; PTX-NEXT:    ld.param.b32 %r3, [retval0];
319; PTX-NEXT:    } // callseq 2
320; PTX-NEXT:    ret;
321; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
322; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
323; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
324; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
325; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
326; OPT-NEXT:    [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
327; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
328; OPT-NEXT:    [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
329; OPT-NEXT:    store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
330; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
331; OPT-NEXT:    ret void
332  %val = load i32, ptr %input
333  %twice = add i32 %val, %val
334  store i32 %twice, ptr %output
335  %call = call i32 @escape(ptr %input)
336  ret void
337}
338
; Kernel that mixes reads of a byval %struct.s with two kinds of escape:
; it loads both fields, stores the parameter pointer through %output, passes
; the field-0 GEP to @escape, and returns the sum of the two loaded fields.
; The OPT checks expect every use (GEPs, loads, the pointer store and the
; call) to go through one generic pointer from llvm.nvvm.ptr.param.to.gen.
; The PTX checks expect one cvta.param, generic ld.u32 loads at offsets 0
; and 4, and no local depot.
339define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
340; PTX-LABEL: grid_const_partial_escapemem(
341; PTX:       {
342; PTX-NEXT:    .reg .b32 %r<6>;
343; PTX-NEXT:    .reg .b64 %rd<7>;
344; PTX-EMPTY:
345; PTX-NEXT:  // %bb.0:
346; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escapemem_param_0;
347; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1];
348; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
349; PTX-NEXT:    mov.u64 %rd5, %rd2;
350; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
351; PTX-NEXT:    ld.u32 %r1, [%rd6];
352; PTX-NEXT:    ld.u32 %r2, [%rd6+4];
353; PTX-NEXT:    st.global.u64 [%rd4], %rd6;
354; PTX-NEXT:    add.s32 %r3, %r1, %r2;
355; PTX-NEXT:    mov.u64 %rd1, escape;
356; PTX-NEXT:    { // callseq 3, 0
357; PTX-NEXT:    .param .b64 param0;
358; PTX-NEXT:    st.param.b64 [param0], %rd6;
359; PTX-NEXT:    .param .b32 retval0;
360; PTX-NEXT:    prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
361; PTX-NEXT:    call (retval0),
362; PTX-NEXT:    %rd1,
363; PTX-NEXT:    (
364; PTX-NEXT:    param0
365; PTX-NEXT:    )
366; PTX-NEXT:    , prototype_3;
367; PTX-NEXT:    ld.param.b32 %r4, [retval0];
368; PTX-NEXT:    } // callseq 3
369; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
370; PTX-NEXT:    ret;
371; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
372; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
373; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
374; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
375; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
376; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
377; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
378; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
379; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
380; OPT-NEXT:    [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
381; OPT-NEXT:    store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8
382; OPT-NEXT:    [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
383; OPT-NEXT:    [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
384; OPT-NEXT:    ret i32 [[ADD]]
385  %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
386  %val1 = load i32, ptr %ptr1
387  %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
388  %val2 = load i32, ptr %ptr2
389  store ptr %input, ptr %output
390  %add = add i32 %val1, %val2
391  %call2 = call i32 @escape(ptr %ptr1)
392  ret i32 %add
393}
394
; Kernel that reaches its byval parameter through a phi: depending on the
; sign of the value loaded from %inout it picks the address of field 0 or
; field 1 of %input1, loads through the merged pointer and stores the result
; back to %inout.
; The OPT checks expect both GEPs to be based on one generic pointer from
; llvm.nvvm.ptr.param.to.gen and the phi to stay a generic-pointer phi.
; The PTX checks expect one cvta.param, a conditional "add.s64 ..., 4" in the
; %second block, and a generic ld.u32 after the merge.
395define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
396; PTX-LABEL: grid_const_phi(
397; PTX:       {
398; PTX-NEXT:    .reg .pred %p<2>;
399; PTX-NEXT:    .reg .b32 %r<3>;
400; PTX-NEXT:    .reg .b64 %rd<9>;
401; PTX-EMPTY:
402; PTX-NEXT:  // %bb.0:
403; PTX-NEXT:    mov.b64 %rd5, grid_const_phi_param_0;
404; PTX-NEXT:    ld.param.u64 %rd6, [grid_const_phi_param_1];
405; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd6;
406; PTX-NEXT:    mov.u64 %rd7, %rd5;
407; PTX-NEXT:    cvta.param.u64 %rd8, %rd7;
408; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
409; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
410; PTX-NEXT:    @%p1 bra $L__BB9_2;
411; PTX-NEXT:  // %bb.1: // %second
412; PTX-NEXT:    add.s64 %rd8, %rd8, 4;
413; PTX-NEXT:  $L__BB9_2: // %merge
414; PTX-NEXT:    ld.u32 %r2, [%rd8];
415; PTX-NEXT:    st.global.u32 [%rd1], %r2;
416; PTX-NEXT:    ret;
417; OPT-LABEL: define ptx_kernel void @grid_const_phi(
418; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
419; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
420; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
421; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
422; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
423; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
424; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
425; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
426; OPT:       [[FIRST]]:
427; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
428; OPT-NEXT:    br label %[[MERGE:.*]]
429; OPT:       [[SECOND]]:
430; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
431; OPT-NEXT:    br label %[[MERGE]]
432; OPT:       [[MERGE]]:
433; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
434; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
435; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
436; OPT-NEXT:    ret void
437
438  %val = load i32, ptr %inout
439  %less = icmp slt i32 %val, 0
440  br i1 %less, label %first, label %second
441first:
442  %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
443  br label %merge
444second:
445  %ptr2 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 1
446  br label %merge
447merge:
448  %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
449  %valloaded = load i32, ptr %ptrnew
450  store i32 %valloaded, ptr %inout
451  ret void
452}
453
454; NOTE: %input2 is *not* grid_constant
; Phi variant with two byval %struct.s parameters: the phi merges the address
; of field 0 of %input1 with the address of field 1 of %input2, and the value
; loaded through the merged pointer is stored back to %inout.
; The OPT checks expect both parameters to be converted with
; llvm.nvvm.ptr.param.to.gen; no alloca or memcpy appears in the expected
; output for either parameter. The PTX checks expect a cvta.param for
; parameter 0 in the entry block and one for parameter 1 inside %second.
455define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
456; PTX-LABEL: grid_const_phi_ngc(
457; PTX:       {
458; PTX-NEXT:    .reg .pred %p<2>;
459; PTX-NEXT:    .reg .b32 %r<3>;
460; PTX-NEXT:    .reg .b64 %rd<12>;
461; PTX-EMPTY:
462; PTX-NEXT:  // %bb.0:
463; PTX-NEXT:    mov.b64 %rd6, grid_const_phi_ngc_param_0;
464; PTX-NEXT:    ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
465; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd7;
466; PTX-NEXT:    mov.u64 %rd10, %rd6;
467; PTX-NEXT:    cvta.param.u64 %rd11, %rd10;
468; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
469; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
470; PTX-NEXT:    @%p1 bra $L__BB10_2;
471; PTX-NEXT:  // %bb.1: // %second
472; PTX-NEXT:    mov.b64 %rd8, grid_const_phi_ngc_param_1;
473; PTX-NEXT:    mov.u64 %rd9, %rd8;
474; PTX-NEXT:    cvta.param.u64 %rd2, %rd9;
475; PTX-NEXT:    add.s64 %rd11, %rd2, 4;
476; PTX-NEXT:  $L__BB10_2: // %merge
477; PTX-NEXT:    ld.u32 %r2, [%rd11];
478; PTX-NEXT:    st.global.u32 [%rd1], %r2;
479; PTX-NEXT:    ret;
480; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
481; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
482; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
483; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
484; OPT-NEXT:    [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
485; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
486; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
487; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
488; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
489; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
490; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
491; OPT:       [[FIRST]]:
492; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
493; OPT-NEXT:    br label %[[MERGE:.*]]
494; OPT:       [[SECOND]]:
495; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
496; OPT-NEXT:    br label %[[MERGE]]
497; OPT:       [[MERGE]]:
498; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
499; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
500; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
501; OPT-NEXT:    ret void
502  %val = load i32, ptr %inout
503  %less = icmp slt i32 %val, 0
504  br i1 %less, label %first, label %second
505first:
506  %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
507  br label %merge
508second:
509  %ptr2 = getelementptr inbounds %struct.s, ptr %input2, i32 0, i32 1
510  br label %merge
511merge:
512  %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
513  %valloaded = load i32, ptr %ptrnew
514  store i32 %valloaded, ptr %inout
515  ret void
516}
517
518; NOTE: %input2 is *not* grid_constant
; Select variant with two byval(i32) parameters: a select on the sign of the
; value loaded from %inout chooses between %input1 and %input2, and the value
; loaded through the chosen pointer is stored back to %inout.
; The OPT checks expect both parameters to be converted with
; llvm.nvvm.ptr.param.to.gen and the select to operate on the two generic
; pointers. The PTX checks expect two cvta.param conversions feeding a
; selp.b64, then a generic ld.u32.
519define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
520; PTX-LABEL: grid_const_select(
521; PTX:       {
522; PTX-NEXT:    .reg .pred %p<2>;
523; PTX-NEXT:    .reg .b32 %r<3>;
524; PTX-NEXT:    .reg .b64 %rd<10>;
525; PTX-EMPTY:
526; PTX-NEXT:  // %bb.0:
527; PTX-NEXT:    mov.b64 %rd1, grid_const_select_param_0;
528; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_select_param_2];
529; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
530; PTX-NEXT:    mov.b64 %rd4, grid_const_select_param_1;
531; PTX-NEXT:    mov.u64 %rd5, %rd4;
532; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
533; PTX-NEXT:    mov.u64 %rd7, %rd1;
534; PTX-NEXT:    cvta.param.u64 %rd8, %rd7;
535; PTX-NEXT:    ld.global.u32 %r1, [%rd3];
536; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
537; PTX-NEXT:    selp.b64 %rd9, %rd8, %rd6, %p1;
538; PTX-NEXT:    ld.u32 %r2, [%rd9];
539; PTX-NEXT:    st.global.u32 [%rd3], %r2;
540; PTX-NEXT:    ret;
541; OPT-LABEL: define ptx_kernel void @grid_const_select(
542; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
543; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
544; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
545; OPT-NEXT:    [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
546; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
547; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
548; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
549; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
550; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
551; OPT-NEXT:    [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
552; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
553; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
554; OPT-NEXT:    ret void
555  %val = load i32, ptr %inout
556  %less = icmp slt i32 %val, 0
557  %ptrnew = select i1 %less, ptr %input1, ptr %input2
558  %valloaded = load i32, ptr %ptrnew
559  store i32 %valloaded, ptr %inout
560  ret void
561}
562
; Kernel that both loads its byval(i32) parameter and converts the parameter
; pointer to an integer; the sum of the two is returned so that neither use
; is dead.
; The OPT checks expect the load to read directly from the addrspace(101)
; view of %input, while the ptrtoint operates on a generic pointer from
; llvm.nvvm.ptr.param.to.gen. They also expect "align 4" on the byval
; parameter although the definition below carries no explicit alignment.
; The PTX checks expect a ld.param.u32 for the value and a cvta.param plus
; cvt.u32.u64 for the pointer.
563define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
564; PTX-LABEL: grid_const_ptrtoint(
565; PTX:       {
566; PTX-NEXT:    .reg .b32 %r<4>;
567; PTX-NEXT:    .reg .b64 %rd<4>;
568; PTX-EMPTY:
569; PTX-NEXT:  // %bb.0:
570; PTX-NEXT:    mov.b64 %rd1, grid_const_ptrtoint_param_0;
571; PTX-NEXT:    mov.u64 %rd2, %rd1;
572; PTX-NEXT:    ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
573; PTX-NEXT:    cvta.param.u64 %rd3, %rd2;
574; PTX-NEXT:    cvt.u32.u64 %r2, %rd3;
575; PTX-NEXT:    add.s32 %r3, %r1, %r2;
576; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
577; PTX-NEXT:    ret;
578; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
579; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
580; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
581; OPT-NEXT:    [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
582; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
583; OPT-NEXT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
584; OPT-NEXT:    [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
585; OPT-NEXT:    ret i32 [[KEEPALIVE]]
586  %val = load i32, ptr %input
587  %ptrval = ptrtoint ptr %input to i32
588  %keepalive = add i32 %val, %ptrval
589  ret i32 %keepalive
590}
591
592
593
; External functions used as opaque sinks for escaping pointers.
; @dummy is declared but not called by any function visible in this file.
; NOTE(review): @escape and @escape3 are declared as returning ptr, but every
; call site in this file uses an i32 return type. The PTX checks show these
; calls emitted through a .callprototype with the callee address in a
; register -- presumably a consequence of that mismatch; confirm whether it
; is intentional before changing either side, since the checks depend on it.
594declare dso_local void @dummy() local_unnamed_addr
595declare dso_local ptr @escape(ptr) local_unnamed_addr
596declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
597
; Named metadata listing every node below. Each even-numbered node has the
; form {kernel, "grid_constant", index-list}; the odd-numbered node after it
; is that index list.
; The indices appear to be 1-based parameter positions: !7 lists 1 and 3 for
; @multiple_grid_const_escape, whose first and third parameters are the two
; byval ones.
; NOTE(review): the index-list nodes are themselves listed as operands of
; !nvvm.annotations, and !17, !18, !19 and !23 are also referenced as !dbg
; attachments in @non_kernel_function even though they are plain index
; lists -- confirm both are intentional.
598!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}
599
600!0 = !{ptr @grid_const_int, !"grid_constant", !1}
601!1 = !{i32 1}
602
603!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
604!3 = !{i32 1}
605
606!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
607!5 = !{i32 1}
608
609!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
610!7 = !{i32 1, i32 3}
611
612!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
613!9 = !{i32 1}
614
615!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
616!11 = !{i32 1}
617
618!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
619!13 = !{i32 1}
620
621!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
622!15 = !{i32 1}
623
624!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
625!17 = !{i32 1}
626
627!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
628!19 = !{i32 1}
629
630!20 = !{ptr @grid_const_select, !"grid_constant", !21}
631!21 = !{i32 1}
632
633!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
634!23 = !{i32 1}
635
636
637