; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX

; This test covers how byval arguments are lowered for a regular function and
; for ptx_kernel functions whose parameters are annotated as grid_constant
; (see the !nvvm.annotations list at the end of the file).

%struct.uint4 = type { i32, i32, i32, i32 }

@gi = dso_local addrspace(1) externally_initialized global %struct.uint4 { i32 50462976, i32 117835012, i32 185207048, i32 252579084 }, align 16

; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
; Regular functions must still make a copy. `cvta.param` does not always work there.
define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly byval(%struct.uint4) align 16 %a, i1 noundef zeroext %b, i32 noundef %c) local_unnamed_addr #0 {
; OPT-LABEL: define dso_local noundef i32 @non_kernel_function(
; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; OPT-NEXT:  [[ENTRY:.*:]]
; OPT-NEXT:    [[A1:%.*]] = alloca [[STRUCT_UINT4]], align 16
; OPT-NEXT:    [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(101)
; OPT-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 16 [[A1]], ptr addrspace(101) align 16 [[A2]], i64 16, i1 false)
; OPT-NEXT:    [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
; OPT-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[C]] to i64
; OPT-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]]
; OPT-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1
; OPT-NEXT:    ret i32 [[TMP0]]
;
; PTX-LABEL: non_kernel_function(
; PTX:       {
; PTX-NEXT:    .local .align 16 .b8 __local_depot0[16];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .pred %p<2>;
; PTX-NEXT:    .reg .b16 %rs<3>;
; PTX-NEXT:    .reg .b32 %r<11>;
; PTX-NEXT:    .reg .b64 %rd<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.u64 %SPL, __local_depot0;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    ld.param.u8 %rs1, [non_kernel_function_param_1];
; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
; PTX-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; PTX-NEXT:    ld.param.s32 %rd1, [non_kernel_function_param_2];
; PTX-NEXT:    ld.param.u64 %rd2, [non_kernel_function_param_0+8];
; PTX-NEXT:    st.u64 [%SP+8], %rd2;
; PTX-NEXT:    ld.param.u64 %rd3, [non_kernel_function_param_0];
; PTX-NEXT:    st.u64 [%SP], %rd3;
; PTX-NEXT:    mov.u64 %rd4, gi;
; PTX-NEXT:    cvta.global.u64 %rd5, %rd4;
; PTX-NEXT:    add.u64 %rd6, %SP, 0;
; PTX-NEXT:    selp.b64 %rd7, %rd6, %rd5, %p1;
; PTX-NEXT:    add.s64 %rd8, %rd7, %rd1;
; PTX-NEXT:    ld.u8 %r1, [%rd8];
; PTX-NEXT:    ld.u8 %r2, [%rd8+1];
; PTX-NEXT:    shl.b32 %r3, %r2, 8;
; PTX-NEXT:    or.b32 %r4, %r3, %r1;
; PTX-NEXT:    ld.u8 %r5, [%rd8+2];
; PTX-NEXT:    shl.b32 %r6, %r5, 16;
; PTX-NEXT:    ld.u8 %r7, [%rd8+3];
; PTX-NEXT:    shl.b32 %r8, %r7, 24;
; PTX-NEXT:    or.b32 %r9, %r8, %r6;
; PTX-NEXT:    or.b32 %r10, %r9, %r4;
; PTX-NEXT:    st.param.b32 [func_retval0], %r10;
; PTX-NEXT:    ret;
entry:
  %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr)
  %idx.ext = sext i32 %c to i64
  %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext
  %0 = load i32, ptr %add.ptr, align 1
  ret i32 %0
}

; Kernel that only loads from its grid_constant byval i32 argument; the checks
; expect the load to read the parameter space directly.
define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
; PTX-LABEL: grid_const_int(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<4>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_int_param_2];
; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT:    ld.param.u32 %r1, [grid_const_int_param_1];
; PTX-NEXT:    ld.param.u32 %r2, [grid_const_int_param_0];
; PTX-NEXT:    add.s32 %r3, %r2, %r1;
; PTX-NEXT:    st.global.u32 [%rd2], %r3;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_int(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; OPT-NEXT:    [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
; OPT-NEXT:    [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT3]], align 4
; OPT-NEXT:    ret void
  %tmp = load i32, ptr %input1, align 4
  %add = add i32 %tmp, %input2
  store i32 %add, ptr %out
  ret void
}

%struct.s = type { i32, i32 }

; Kernel that loads both fields of its grid_constant byval struct through GEPs.
define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
; PTX-LABEL: grid_const_struct(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<4>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_struct_param_1];
; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT:    ld.param.u32 %r1, [grid_const_struct_param_0];
; PTX-NEXT:    ld.param.u32 %r2, [grid_const_struct_param_0+4];
; PTX-NEXT:    add.s32 %r3, %r1, %r2;
; PTX-NEXT:    st.global.u32 [%rd2], %r3;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; OPT-NEXT:    [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
; OPT-NEXT:    [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
; OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT5]], align 4
; OPT-NEXT:    ret void
  %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
  %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
  %int1 = load i32, ptr %gep1
  %int2 = load i32, ptr %gep2
  %add = add i32 %int1, %int2
  store i32 %add, ptr %out
  ret void
}

; Kernel whose grid_constant byval pointer escapes through a call argument.
define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd2, grid_const_escape_param_0;
; PTX-NEXT:    mov.u64 %rd3, %rd2;
; PTX-NEXT:    cvta.param.u64 %rd4, %rd3;
; PTX-NEXT:    mov.u64 %rd1, escape;
; PTX-NEXT:    { // callseq 0, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd4;
; PTX-NEXT:    .param .b32 retval0;
; PTX-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT:    call (retval0),
; PTX-NEXT:    %rd1,
; PTX-NEXT:    (
; PTX-NEXT:    param0
; PTX-NEXT:    )
; PTX-NEXT:    , prototype_0;
; PTX-NEXT:    ld.param.b32 %r1, [retval0];
; PTX-NEXT:    } // callseq 0
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
; OPT-NEXT:    ret void
  %call = call i32 @escape(ptr %input)
  ret void
}

; Kernel passing two byval pointers (%input and %b) plus the address of a local
; alloca to the same call.
define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
; PTX-LABEL: multiple_grid_const_escape(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot4[4];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<4>;
; PTX-NEXT:    .reg .b64 %rd<10>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.u64 %SPL, __local_depot4;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    mov.b64 %rd2, multiple_grid_const_escape_param_0;
; PTX-NEXT:    mov.b64 %rd3, multiple_grid_const_escape_param_2;
; PTX-NEXT:    mov.u64 %rd4, %rd3;
; PTX-NEXT:    ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
; PTX-NEXT:    mov.u64 %rd6, %rd2;
; PTX-NEXT:    cvta.param.u64 %rd7, %rd6;
; PTX-NEXT:    add.u64 %rd8, %SP, 0;
; PTX-NEXT:    add.u64 %rd9, %SPL, 0;
; PTX-NEXT:    st.local.u32 [%rd9], %r1;
; PTX-NEXT:    mov.u64 %rd1, escape3;
; PTX-NEXT:    { // callseq 1, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd7;
; PTX-NEXT:    .param .b64 param1;
; PTX-NEXT:    st.param.b64 [param1], %rd8;
; PTX-NEXT:    .param .b64 param2;
; PTX-NEXT:    st.param.b64 [param2], %rd5;
; PTX-NEXT:    .param .b32 retval0;
; PTX-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
; PTX-NEXT:    call (retval0),
; PTX-NEXT:    %rd1,
; PTX-NEXT:    (
; PTX-NEXT:    param0,
; PTX-NEXT:    param1,
; PTX-NEXT:    param2
; PTX-NEXT:    )
; PTX-NEXT:    , prototype_1;
; PTX-NEXT:    ld.param.b32 %r2, [retval0];
; PTX-NEXT:    } // callseq 1
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
; OPT-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
; OPT-NEXT:    ret void
  %a.addr = alloca i32, align 4
  store i32 %a, ptr %a.addr, align 4
  %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
  ret void
}

; Kernel whose grid_constant byval pointer escapes by being stored to memory.
define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
; PTX-LABEL: grid_const_memory_escape(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd1, grid_const_memory_escape_param_0;
; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT:    mov.u64 %rd4, %rd1;
; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
; OPT-NEXT:    [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR5]], align 8
; OPT-NEXT:    ret void
  store ptr %input, ptr %addr, align 8
  ret void
}

; Kernel whose grid_constant byval pointer escapes as inline-asm operands.
define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
; PTX-NEXT:    ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
; PTX-NEXT:    cvta.to.global.u64 %rd6, %rd5;
; PTX-NEXT:    mov.u64 %rd7, %rd4;
; PTX-NEXT:    cvta.param.u64 %rd2, %rd7;
; PTX-NEXT:    add.s64 %rd3, %rd2, 4;
; PTX-NEXT:    // begin inline asm
; PTX-NEXT:    add.s64 %rd1, %rd2, %rd3;
; PTX-NEXT:    // end inline asm
; PTX-NEXT:    st.global.u64 [%rd6], %rd1;
; PTX-NEXT:    ret;
; PTX-NOT:     .local
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
; OPT-NEXT:    [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
; OPT-NEXT:    [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT:    [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT:    [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
; OPT-NEXT:    store i64 [[TMP2]], ptr [[RESULT5]], align 8
; OPT-NEXT:    ret void
  %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
  %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
  %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
  store i64 %1, ptr %result, align 8
  ret void
}

; Kernel that both loads from its grid_constant byval pointer and passes the
; same pointer to a call.
define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<5>;
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escape_param_0;
; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escape_param_1];
; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; PTX-NEXT:    mov.u64 %rd5, %rd2;
; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
; PTX-NEXT:    ld.u32 %r1, [%rd6];
; PTX-NEXT:    add.s32 %r2, %r1, %r1;
; PTX-NEXT:    st.global.u32 [%rd4], %r2;
; PTX-NEXT:    mov.u64 %rd1, escape;
; PTX-NEXT:    { // callseq 2, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd6;
; PTX-NEXT:    .param .b32 retval0;
; PTX-NEXT:    prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT:    call (retval0),
; PTX-NEXT:    %rd1,
; PTX-NEXT:    (
; PTX-NEXT:    param0
; PTX-NEXT:    )
; PTX-NEXT:    , prototype_2;
; PTX-NEXT:    ld.param.b32 %r3, [retval0];
; PTX-NEXT:    } // callseq 2
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
; OPT-NEXT:    [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
; OPT-NEXT:    store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
; OPT-NEXT:    ret void
  %val = load i32, ptr %input
  %twice = add i32 %val, %val
  store i32 %twice, ptr %output
  %call = call i32 @escape(ptr %input)
  ret void
}

; Kernel that loads both struct fields, stores the byval pointer to memory and
; passes a GEP of it to a call.
define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<6>;
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escapemem_param_0;
; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1];
; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; PTX-NEXT:    mov.u64 %rd5, %rd2;
; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
; PTX-NEXT:    ld.u32 %r1, [%rd6];
; PTX-NEXT:    ld.u32 %r2, [%rd6+4];
; PTX-NEXT:    st.global.u64 [%rd4], %rd6;
; PTX-NEXT:    add.s32 %r3, %r1, %r2;
; PTX-NEXT:    mov.u64 %rd1, escape;
; PTX-NEXT:    { // callseq 3, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd6;
; PTX-NEXT:    .param .b32 retval0;
; PTX-NEXT:    prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT:    call (retval0),
; PTX-NEXT:    %rd1,
; PTX-NEXT:    (
; PTX-NEXT:    param0
; PTX-NEXT:    )
; PTX-NEXT:    , prototype_3;
; PTX-NEXT:    ld.param.b32 %r4, [retval0];
; PTX-NEXT:    } // callseq 3
; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT:    [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
; OPT-NEXT:    store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8
; OPT-NEXT:    [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
; OPT-NEXT:    [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
; OPT-NEXT:    ret i32 [[ADD]]
  %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
  %val1 = load i32, ptr %ptr1
  %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
  %val2 = load i32, ptr %ptr2
  store ptr %input, ptr %output
  %add = add i32 %val1, %val2
  %call2 = call i32 @escape(ptr %ptr1)
  ret i32 %add
}

; Kernel that loads through a phi of two GEPs into the same grid_constant
; byval struct.
define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
; PTX-LABEL: grid_const_phi(
; PTX:       {
; PTX-NEXT:    .reg .pred %p<2>;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd5, grid_const_phi_param_0;
; PTX-NEXT:    ld.param.u64 %rd6, [grid_const_phi_param_1];
; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd6;
; PTX-NEXT:    mov.u64 %rd7, %rd5;
; PTX-NEXT:    cvta.param.u64 %rd8, %rd7;
; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT:    @%p1 bra $L__BB9_2;
; PTX-NEXT:  // %bb.1: // %second
; PTX-NEXT:    add.s64 %rd8, %rd8, 4;
; PTX-NEXT:  $L__BB9_2: // %merge
; PTX-NEXT:    ld.u32 %r2, [%rd8];
; PTX-NEXT:    st.global.u32 [%rd1], %r2;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT:       [[FIRST]]:
; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
; OPT-NEXT:    br label %[[MERGE:.*]]
; OPT:       [[SECOND]]:
; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
; OPT-NEXT:    br label %[[MERGE]]
; OPT:       [[MERGE]]:
; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
; OPT-NEXT:    ret void

  %val = load i32, ptr %inout
  %less = icmp slt i32 %val, 0
  br i1 %less, label %first, label %second
first:
  %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
  br label %merge
second:
  %ptr2 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 1
  br label %merge
merge:
  %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
  %valloaded = load i32, ptr %ptrnew
  store i32 %valloaded, ptr %inout
  ret void
}

; NOTE: %input2 is *not* grid_constant
define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
; PTX-LABEL: grid_const_phi_ngc(
; PTX:       {
; PTX-NEXT:    .reg .pred %p<2>;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<12>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd6, grid_const_phi_ngc_param_0;
; PTX-NEXT:    ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd7;
; PTX-NEXT:    mov.u64 %rd10, %rd6;
; PTX-NEXT:    cvta.param.u64 %rd11, %rd10;
; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT:    @%p1 bra $L__BB10_2;
; PTX-NEXT:  // %bb.1: // %second
; PTX-NEXT:    mov.b64 %rd8, grid_const_phi_ngc_param_1;
; PTX-NEXT:    mov.u64 %rd9, %rd8;
; PTX-NEXT:    cvta.param.u64 %rd2, %rd9;
; PTX-NEXT:    add.s64 %rd11, %rd2, 4;
; PTX-NEXT:  $L__BB10_2: // %merge
; PTX-NEXT:    ld.u32 %r2, [%rd11];
; PTX-NEXT:    st.global.u32 [%rd1], %r2;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
; OPT-NEXT:    [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT:       [[FIRST]]:
; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
; OPT-NEXT:    br label %[[MERGE:.*]]
; OPT:       [[SECOND]]:
; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
; OPT-NEXT:    br label %[[MERGE]]
; OPT:       [[MERGE]]:
; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
; OPT-NEXT:    ret void
  %val = load i32, ptr %inout
  %less = icmp slt i32 %val, 0
  br i1 %less, label %first, label %second
first:
  %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
  br label %merge
second:
  %ptr2 = getelementptr inbounds %struct.s, ptr %input2, i32 0, i32 1
  br label %merge
merge:
  %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
  %valloaded = load i32, ptr %ptrnew
  store i32 %valloaded, ptr %inout
  ret void
}

; NOTE: %input2 is *not* grid_constant
define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
; PTX-LABEL: grid_const_select(
; PTX:       {
; PTX-NEXT:    .reg .pred %p<2>;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<10>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd1, grid_const_select_param_0;
; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_select_param_2];
; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT:    mov.b64 %rd4, grid_const_select_param_1;
; PTX-NEXT:    mov.u64 %rd5, %rd4;
; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
; PTX-NEXT:    mov.u64 %rd7, %rd1;
; PTX-NEXT:    cvta.param.u64 %rd8, %rd7;
; PTX-NEXT:    ld.global.u32 %r1, [%rd3];
; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT:    selp.b64 %rd9, %rd8, %rd6, %p1;
; PTX-NEXT:    ld.u32 %r2, [%rd9];
; PTX-NEXT:    st.global.u32 [%rd3], %r2;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel void @grid_const_select(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
; OPT-NEXT:    [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT:    [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
; OPT-NEXT:    ret void
  %val = load i32, ptr %inout
  %less = icmp slt i32 %val, 0
  %ptrnew = select i1 %less, ptr %input1, ptr %input2
  %valloaded = load i32, ptr %ptrnew
  store i32 %valloaded, ptr %inout
  ret void
}

; Kernel that both loads from its grid_constant byval pointer and converts the
; pointer to an integer.
define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
; PTX-LABEL: grid_const_ptrtoint(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<4>;
; PTX-NEXT:    .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %rd1, grid_const_ptrtoint_param_0;
; PTX-NEXT:    mov.u64 %rd2, %rd1;
; PTX-NEXT:    ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
; PTX-NEXT:    cvta.param.u64 %rd3, %rd2;
; PTX-NEXT:    cvt.u32.u64 %r2, %rd3;
; PTX-NEXT:    add.s32 %r3, %r1, %r2;
; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
; PTX-NEXT:    ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT:    [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
; OPT-NEXT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
; OPT-NEXT:    [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
; OPT-NEXT:    ret i32 [[KEEPALIVE]]
  %val = load i32, ptr %input
  %ptrval = ptrtoint ptr %input to i32
  %keepalive = add i32 %val, %ptrval
  ret i32 %keepalive
}



; NOTE(review): @escape and @escape3 are declared as returning ptr, but every
; call site above uses an i32 return type. The llc checks expect a
; .callprototype-based call sequence, presumably because of that mismatch.
; Regenerate the assertions if these declarations are ever changed.
declare dso_local void @dummy() local_unnamed_addr
declare dso_local ptr @escape(ptr) local_unnamed_addr
declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr

; Each "grid_constant" annotation pairs a kernel with a list of parameter
; indices (apparently 1-based: !7 names the first and third parameters of
; @multiple_grid_const_escape, which are its two byval arguments).
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}

!0 = !{ptr @grid_const_int, !"grid_constant", !1}
!1 = !{i32 1}

!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
!3 = !{i32 1}

!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
!5 = !{i32 1}

!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
!7 = !{i32 1, i32 3}

!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
!9 = !{i32 1}

!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
!11 = !{i32 1}

!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
!13 = !{i32 1}

!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
!15 = !{i32 1}

!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
!17 = !{i32 1}

!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
!19 = !{i32 1}

!20 = !{ptr @grid_const_select, !"grid_constant", !21}
!21 = !{i32 1}

!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
!23 = !{i32 1}

