; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU
; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX

%struct.ident_t = type { i32, i32, i32, i32, ptr }
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }

@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8
@spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }

;.
; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; AMDGPU: @spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
;.
; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; NVPTX: @spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
;.
define weak ptx_kernel void @spmd_callees(i1 %c) #0 {
; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees
; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; AMDGPU-NEXT:    call void @spmd_callees__debug(i1 [[C]])
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_callees
; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; NVPTX-NEXT:    call void @spmd_callees__debug(i1 [[C]])
; NVPTX-NEXT:    ret void
;
  call void @spmd_callees__debug(i1 %c)
  ret void
}

define internal void @spmd_callees__debug(i1 %c) {
; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees__debug
; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null)
; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU:       common.ret:
; AMDGPU-NEXT:    ret void
; AMDGPU:       user_code.entry:
; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]]
; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]]
; AMDGPU-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
; AMDGPU-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
; AMDGPU-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; AMDGPU:       3:
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7:%.*]]
; AMDGPU:       4:
; AMDGPU-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU:       5:
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7]]
; AMDGPU:       6:
; AMDGPU-NEXT:    unreachable
; AMDGPU:       7:
; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_callees__debug
; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null)
; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX:       common.ret:
; NVPTX-NEXT:    ret void
; NVPTX:       user_code.entry:
; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]]
; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]]
; NVPTX-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
; NVPTX-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
; NVPTX-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NVPTX:       3:
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7:%.*]]
; NVPTX:       4:
; NVPTX-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX:       5:
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7]]
; NVPTX:       6:
; NVPTX-NEXT:    unreachable
; NVPTX:       7:
; NVPTX-NEXT:    call void @__kmpc_target_deinit()
; NVPTX-NEXT:    br label [[COMMON_RET]]
;
entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  %0 = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:                                       ; preds = %entry, %user_code.entry
  ret void

user_code.entry:                                  ; preds = %entry
  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
  store i32 0, ptr %.zero.addr, align 4
  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
  %fp = select i1 %c, ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
  call void %fp(ptr %.threadid_temp., ptr %.zero.addr) #6
  call void @__kmpc_target_deinit()
  br label %common.ret
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined_spmd_amenable1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
; AMDGPU:       for.cond:
; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU:       for.cond.cleanup:
; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6:[0-9]+]]
; AMDGPU-NEXT:    ret void
; AMDGPU:       for.body:
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
; NVPTX:       for.cond:
; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX:       for.cond.cleanup:
; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6:[0-9]+]]
; NVPTX-NEXT:    ret void
; NVPTX:       for.body:
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
;
entry:
  %captured_vars_addrs = alloca [0 x ptr], align 8
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp slt i32 %i.0, 100
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  call void @spmd_amenable() #10
  ret void

for.body:                                         ; preds = %for.cond
  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr %captured_vars_addrs, i64 0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond, !llvm.loop !22
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    call void @unknown() #[[ATTR7:[0-9]+]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    call void @unknown() #[[ATTR7:[0-9]+]]
; NVPTX-NEXT:    ret void
;
entry:
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT:    call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT:    call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    ret void
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  call void @__omp_outlined__1(ptr %.addr1, ptr %.zero.addr) #6
  ret void
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined_spmd_amenable2(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr
; AMDGPU-NEXT:    call void @use(ptr captures(none) [[MALLOC_CAST]]) #[[ATTR6]]
; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
; AMDGPU:       for.cond:
; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU:       for.cond.cleanup:
; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; AMDGPU-NEXT:    ret void
; AMDGPU:       for.body:
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[X_H2S:%.*]] = alloca i8, i64 4, align 4
; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-NEXT:    call void @use(ptr captures(none) [[X_H2S]]) #[[ATTR6]]
; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
; NVPTX:       for.cond:
; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX:       for.cond.cleanup:
; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; NVPTX-NEXT:    ret void
; NVPTX:       for.body:
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
;
entry:
  %captured_vars_addrs = alloca [0 x ptr], align 8
  %x = call align 4 ptr @__kmpc_alloc_shared(i64 4)
  call void @use(ptr nocapture %x) #10
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp slt i32 %i.0, 100
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  call void @spmd_amenable() #10
  call void @__kmpc_free_shared(ptr %x, i64 4)
  ret void

for.body:                                         ; preds = %for.cond
  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr %captured_vars_addrs, i64 0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond, !llvm.loop !25
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    call void @unknown() #[[ATTR7]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    call void @unknown() #[[ATTR7]]
; NVPTX-NEXT:    ret void
;
entry:
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT:    call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT:    call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    ret void
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  call void @__omp_outlined__3(ptr %.addr1, ptr %.zero.addr) #6
  ret void
}


; Function Attrs: alwaysinline convergent norecurse nounwind
define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee
; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callee_kernel_environment, ptr null)
; AMDGPU-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU:       is_worker_check:
; AMDGPU-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU:       worker_state_machine.begin:
; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU:       worker_state_machine.finished:
; AMDGPU-NEXT:    ret void
; AMDGPU:       worker_state_machine.is_active.check:
; AMDGPU-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU:       worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU:       worker_state_machine.parallel_region.end:
; AMDGPU-NEXT:    call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU:       worker_state_machine.done.barrier:
; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU:       thread.user_code.check:
; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU:       common.ret:
; AMDGPU-NEXT:    ret void
; AMDGPU:       user_code.entry:
; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
; AMDGPU-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
; AMDGPU-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; AMDGPU:       3:
; AMDGPU-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7:%.*]]
; AMDGPU:       4:
; AMDGPU-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU:       5:
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7]]
; AMDGPU:       6:
; AMDGPU-NEXT:    unreachable
; AMDGPU:       7:
; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee
; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR0]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callee_kernel_environment, ptr null)
; NVPTX-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX:       is_worker_check:
; NVPTX-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX:       worker_state_machine.begin:
; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX:       worker_state_machine.finished:
; NVPTX-NEXT:    ret void
; NVPTX:       worker_state_machine.is_active.check:
; NVPTX-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX:       worker_state_machine.parallel_region.fallback.execute:
; NVPTX-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX:       worker_state_machine.parallel_region.end:
; NVPTX-NEXT:    call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX:       worker_state_machine.done.barrier:
; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX:       thread.user_code.check:
; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX:       common.ret:
; NVPTX-NEXT:    ret void
; NVPTX:       user_code.entry:
; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
; NVPTX-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
; NVPTX-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NVPTX:       3:
; NVPTX-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7:%.*]]
; NVPTX:       4:
; NVPTX-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX:       5:
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7]]
; NVPTX:       6:
; NVPTX-NEXT:    unreachable
; NVPTX:       7:
; NVPTX-NEXT:    call void @__kmpc_target_deinit()
; NVPTX-NEXT:    br label [[COMMON_RET]]
;
entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  %0 = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callee_kernel_environment, ptr null)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:                                       ; preds = %entry, %user_code.entry
  ret void

user_code.entry:                                  ; preds = %entry
  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
  store i32 0, ptr %.zero.addr, align 4
  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
  %fp = select i1 %c, ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
  call void %fp(ptr %.threadid_temp., ptr %.zero.addr) #6
  call void @__kmpc_target_deinit()
  br label %common.ret
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-NEXT:    [[X:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
; AMDGPU:       for.cond:
; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU:       for.cond.cleanup:
; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; AMDGPU-NEXT:    call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
; AMDGPU:       for.body:
; AMDGPU-NEXT:    store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]]
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-NEXT:    [[X:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR10]]
; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
; NVPTX:       for.cond:
; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX:       for.cond.cleanup:
; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; NVPTX-NEXT:    call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]]
; NVPTX-NEXT:    ret void
; NVPTX:       for.body:
; NVPTX-NEXT:    store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]]
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
entry:
  %captured_vars_addrs = alloca [1 x ptr], align 8
  %x = call align 4 ptr @__kmpc_alloc_shared(i64 4)
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp slt i32 %i.0, 100
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  call void @spmd_amenable() #10
  call void @__kmpc_free_shared(ptr %x, i64 4)
  ret void

for.body:                                         ; preds = %for.cond
  store ptr %x, ptr %captured_vars_addrs, align 8, !tbaa !26
  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr %captured_vars_addrs, i64 1)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond, !llvm.loop !28
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-NEXT:    store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @unknown() #[[ATTR7]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-NEXT:    store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @unknown() #[[ATTR7]]
; NVPTX-NEXT:    ret void
;
entry:
  %0 = load i32, ptr %x, align 4, !tbaa !18
  %inc = add nsw i32 %0, 1
  store i32 %inc, ptr %x, align 4, !tbaa !18
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]]
; AMDGPU-NEXT:    call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]]
; NVPTX-NEXT:    call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]]
; NVPTX-NEXT:    ret void
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  %2 = load ptr, ptr %global_args, align 8
  %3 = load ptr, ptr %2, align 8, !tbaa !26
  call void @__omp_outlined__5(ptr %.addr1, ptr %.zero.addr, ptr %3) #6
  ret void
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define weak ptx_kernel void @spmd_callees_metadata(ptr %fp) #0 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees_metadata
; AMDGPU-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null)
; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU:       common.ret:
; AMDGPU-NEXT:    ret void
; AMDGPU:       user_code.entry:
; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_callees_metadata
; NVPTX-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null)
; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX:       common.ret:
; NVPTX-NEXT:    ret void
; NVPTX:       user_code.entry:
; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; NVPTX-NEXT:    call void @__kmpc_target_deinit()
; NVPTX-NEXT:    br label [[COMMON_RET]]
;
entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  %0 = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:                                       ; preds = %entry, %user_code.entry
  ret void

user_code.entry:                                  ; preds = %entry
  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
  store i32 0, ptr %.zero.addr, align 4
  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
  call void %fp(ptr %.threadid_temp., ptr %.zero.addr), !callees !31
  call void @__kmpc_target_deinit()
  br label %common.ret
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata
; AMDGPU-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null)
; AMDGPU-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU:       is_worker_check:
; AMDGPU-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU:       worker_state_machine.begin:
; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU:       worker_state_machine.finished:
; AMDGPU-NEXT:    ret void
; AMDGPU:       worker_state_machine.is_active.check:
; AMDGPU-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU:       worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU:       worker_state_machine.parallel_region.end:
; AMDGPU-NEXT:    call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU:       worker_state_machine.done.barrier:
; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU:       thread.user_code.check:
; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU:       common.ret:
; AMDGPU-NEXT:    ret void
; AMDGPU:       user_code.entry:
; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
; AMDGPU-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; AMDGPU:       3:
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; AMDGPU-NEXT:    br label [[TMP7:%.*]]
; AMDGPU:       4:
; AMDGPU-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU:       5:
; AMDGPU-NEXT:    call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; AMDGPU-NEXT:    br label [[TMP7]]
; AMDGPU:       6:
; AMDGPU-NEXT:    unreachable
; AMDGPU:       7:
; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata
; NVPTX-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null)
; NVPTX-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX:       is_worker_check:
; NVPTX-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX:       worker_state_machine.begin:
; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX:       worker_state_machine.finished:
; NVPTX-NEXT:    ret void
; NVPTX:       worker_state_machine.is_active.check:
; NVPTX-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX:       worker_state_machine.parallel_region.fallback.execute:
; NVPTX-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX:       worker_state_machine.parallel_region.end:
; NVPTX-NEXT:    call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX:       worker_state_machine.done.barrier:
; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX:       thread.user_code.check:
; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX:       common.ret:
; NVPTX-NEXT:    ret void
; NVPTX:       user_code.entry:
; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
; NVPTX-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NVPTX:       3:
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; NVPTX-NEXT:    br label [[TMP7:%.*]]
; NVPTX:       4:
; NVPTX-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX:       5:
; NVPTX-NEXT:    call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; NVPTX-NEXT:    br label [[TMP7]]
; NVPTX:       6:
; NVPTX-NEXT:    unreachable
; NVPTX:       7:
; NVPTX-NEXT:    call void @__kmpc_target_deinit()
; NVPTX-NEXT:    br label [[COMMON_RET]]
;
entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  %0 = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:                                       ; preds = %entry, %user_code.entry
  ret void

user_code.entry:                                  ; preds = %entry
  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
  store i32 0, ptr %.zero.addr, align 4
  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
  call void %fp(ptr %.threadid_temp., ptr %.zero.addr), !callees !32
  call void @__kmpc_target_deinit()
  br label %common.ret
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define void @__omp_outlined_spmd_amenable_external(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
; AMDGPU:       for.cond:
; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU:       for.cond.cleanup:
; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; AMDGPU-NEXT:    ret void
; AMDGPU:       for.body:
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0)
; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
; NVPTX:       for.cond:
; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX:       for.cond.cleanup:
; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; NVPTX-NEXT:    ret void
; NVPTX:       for.body:
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0)
; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
;
entry:
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp slt i32 %i.0, 100
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  call void @spmd_amenable() #10
  ret void

for.body:                                         ; preds = %for.cond
  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond, !llvm.loop !29
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__7(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    ret void
;
entry:
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    ret void
;
entry:
  ret void
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define void @__omp_outlined_not_spmd_amenable_external(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable_external
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTGLOBAL_TID_]], ptr [[DOTBOUND_TID_]])
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable_external
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTGLOBAL_TID_]], ptr [[DOTBOUND_TID_]])
; NVPTX-NEXT:    ret void
;
  call void @__omp_outlined_not_spmd_amenable(ptr %.global_tid., ptr %.bound_tid.);
  ret void
}

define internal void @__omp_outlined_not_spmd_amenable(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    call void @unknown() #[[ATTR7]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    call void @unknown() #[[ATTR7]]
; NVPTX-NEXT:    ret void
;
entry:
  call void @unknown() #11
  ret void
}

; Function Attrs: nosync nounwind
declare void @__kmpc_free_shared(ptr nocapture, i64) #8

; Function Attrs: nofree nosync nounwind
declare ptr @__kmpc_alloc_shared(i64) #7

; Function Attrs: convergent
declare void @use(ptr nocapture) #5

; Function Attrs: convergent
declare void @unknown() #2
declare void @unknowni32p(ptr) #2

; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1

; Make it a weak definition so we will apply custom state machine rewriting but can't use the body in the reasoning.
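; (Weak linkage means the body below may be replaced at link time, so the optimizer can still insert the custom
; state machine around calls to it but must not propagate the `ret i32 0` result into the kernels above.)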
define weak i32 @__kmpc_target_init(ptr, ptr) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__kmpc_target_init
; AMDGPU-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; AMDGPU-NEXT:    ret i32 0
;
; NVPTX-LABEL: define {{[^@]+}}@__kmpc_target_init
; NVPTX-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; NVPTX-NEXT:    ret i32 0
;
  ret i32 0
}

declare void @__kmpc_get_shared_variables(ptr)

; Function Attrs: alwaysinline
declare void @__kmpc_parallel_51(ptr, i32, i32, i32, i32, ptr, ptr, ptr, i64) #4

; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1

; Function Attrs: convergent
declare void @spmd_amenable() #5

; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(ptr) #6

declare void @__kmpc_target_deinit()


; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__9(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
entry:
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  call void @__omp_outlined__9(ptr %.addr1, ptr %.zero.addr) #6
  ret void
}

declare fastcc i32 @__kmpc_get_hardware_thread_id_in_block();

attributes #0 = { alwaysinline convergent norecurse nounwind "kernel" }
attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
attributes #2 = { convergent }
attributes #3 = { convergent norecurse nounwind }
attributes #4 = { alwaysinline }
attributes #5 = { convergent "llvm.assume"="ompx_spmd_amenable" }
attributes #6 = { nounwind }
attributes #7 = { nofree nosync nounwind }
attributes #8 = { nosync nounwind }
attributes #9 = { alwaysinline convergent nounwind }
attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" }
attributes #11 = { convergent }

!omp_offload.info = !{!0, !1, !2, !3, !4, !5}
!llvm.module.flags = !{!12, !13, !14, !15, !16}
!llvm.ident = !{!17}

!0 = !{i32 0, i32 64770, i32 541341486, !"", i32 74, i32 5}
!1 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
!2 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
!3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
!4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
!5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
!12 = !{i32 1, !"wchar_size", i32 4}
!13 = !{i32 7, !"openmp", i32 50}
!14 = !{i32 7, !"openmp-device", i32 50}
!15 = !{i32 8, !"PIC Level", i32 2}
!16 = !{i32 7, !"frame-pointer", i32 2}
!17 = !{!"clang version 14.0.0"}
!18 = !{!19, !19, i64 0}
!19 = !{!"int", !20, i64 0}
!20 = !{!"omnipotent char", !21, i64 0}
!21 = !{!"Simple C/C++ TBAA"}
!22 = distinct !{!22, !23, !24}
!23 = !{!"llvm.loop.mustprogress"}
!24 = !{!"llvm.loop.unroll.disable"}
!25 = distinct !{!25, !23, !24}
!26 = !{!27, !27, i64 0}
!27 = !{!"any pointer", !20, i64 0}
!28 = distinct !{!28, !23, !24}
!29 = distinct !{!29, !23, !24}
!30 = !{!31, !27, i64 0}
!31 = !{ptr @__omp_outlined_spmd_amenable_external, ptr @__omp_outlined_not_spmd_amenable}
!32 = !{ptr @__omp_outlined_spmd_amenable_external, ptr @__omp_outlined_not_spmd_amenable_external}
;.
; AMDGPU: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; AMDGPU: attributes #[[ATTR1]] = { norecurse }
; AMDGPU: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; AMDGPU: attributes #[[ATTR3]] = { norecurse nounwind }
; AMDGPU: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
; AMDGPU: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; AMDGPU: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; AMDGPU: attributes #[[ATTR7]] = { convergent }
; AMDGPU: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; AMDGPU: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
; AMDGPU: attributes #[[ATTR10]] = { nounwind }
; AMDGPU: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; NVPTX: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; NVPTX: attributes #[[ATTR1]] = { norecurse }
; NVPTX: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; NVPTX: attributes #[[ATTR3]] = { norecurse nounwind }
; NVPTX: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
; NVPTX: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; NVPTX: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; NVPTX: attributes #[[ATTR7]] = { convergent }
; NVPTX: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; NVPTX: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
; NVPTX: attributes #[[ATTR10]] = { nounwind }
; NVPTX: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; AMDGPU: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"", i32 74, i32 5}
; AMDGPU: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; AMDGPU: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; AMDGPU: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; AMDGPU: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; AMDGPU: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0}
; AMDGPU: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0}
; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"}
; AMDGPU: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]}
; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"}
; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"}
; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]}
; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0}
; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]}
; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]}
;.
; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"", i32 74, i32 5}
; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; NVPTX: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; NVPTX: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; NVPTX: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; NVPTX: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0}
; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0}
; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"}
; NVPTX: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]}
; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"}
; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"}
; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]}
; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0}
; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]}
; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]}
;.