; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU
; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX
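;
; This test exercises SPMD-ization of kernels whose parallel work is reached
; through indirect (function pointer) calls, with the set of possible callees
; either deduced from a select or provided via !callees metadata.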

%struct.ident_t = type { i32, i32, i32, i32, ptr }
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }

@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8
@spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }

;.
; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; AMDGPU: @spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
;.
; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; NVPTX: @spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
;.
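
; Both possible callees of the indirect call in @spmd_callees__debug are
; SPMD-amenable, so the kernel is SPMD-ized (exec mode 3 in the rewritten
; kernel environment above) and the function pointer call is expanded into
; guarded direct calls, as the checks below show.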
define weak ptx_kernel void @spmd_callees(i1 %c) #0 {
; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees
; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; AMDGPU-NEXT:    call void @spmd_callees__debug(i1 [[C]])
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_callees
; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; NVPTX-NEXT:    call void @spmd_callees__debug(i1 [[C]])
; NVPTX-NEXT:    ret void
;
  call void @spmd_callees__debug(i1 %c)
  ret void
}

define internal void @spmd_callees__debug(i1 %c) {
; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees__debug
; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null)
; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU:       common.ret:
; AMDGPU-NEXT:    ret void
; AMDGPU:       user_code.entry:
; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]]
; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]]
; AMDGPU-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
; AMDGPU-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
; AMDGPU-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; AMDGPU:       3:
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7:%.*]]
; AMDGPU:       4:
; AMDGPU-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU:       5:
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7]]
; AMDGPU:       6:
; AMDGPU-NEXT:    unreachable
; AMDGPU:       7:
; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_callees__debug
; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null)
; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX:       common.ret:
; NVPTX-NEXT:    ret void
; NVPTX:       user_code.entry:
; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]]
; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]]
; NVPTX-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
; NVPTX-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
; NVPTX-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NVPTX:       3:
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7:%.*]]
; NVPTX:       4:
; NVPTX-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX:       5:
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7]]
; NVPTX:       6:
; NVPTX-NEXT:    unreachable
; NVPTX:       7:
; NVPTX-NEXT:    call void @__kmpc_target_deinit()
; NVPTX-NEXT:    br label [[COMMON_RET]]
;
entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  %0 = call i32 @__kmpc_target_init(ptr @spmd_callees_kernel_environment, ptr null)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:                                       ; preds = %entry, %user_code.entry
  ret void

user_code.entry:                                  ; preds = %entry
  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
  store i32 0, ptr %.zero.addr, align 4
  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
  %fp = select i1 %c, ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
  call void %fp(ptr %.threadid_temp., ptr %.zero.addr) #6
  call void @__kmpc_target_deinit()
  br label %common.ret
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined_spmd_amenable1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
; AMDGPU:       for.cond:
; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU:       for.cond.cleanup:
; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6:[0-9]+]]
; AMDGPU-NEXT:    ret void
; AMDGPU:       for.body:
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
; NVPTX:       for.cond:
; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX:       for.cond.cleanup:
; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6:[0-9]+]]
; NVPTX-NEXT:    ret void
; NVPTX:       for.body:
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
;
entry:
  %captured_vars_addrs = alloca [0 x ptr], align 8
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp slt i32 %i.0, 100
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  call void @spmd_amenable() #10
  ret void

for.body:                                         ; preds = %for.cond
  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr %captured_vars_addrs, i64 0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond, !llvm.loop !22
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    call void @unknown() #[[ATTR7:[0-9]+]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    call void @unknown() #[[ATTR7:[0-9]+]]
; NVPTX-NEXT:    ret void
;
entry:
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT:    call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT:    call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    ret void
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  call void @__omp_outlined__1(ptr %.addr1, ptr %.zero.addr) #6
  ret void
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined_spmd_amenable2(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr
; AMDGPU-NEXT:    call void @use(ptr captures(none) [[MALLOC_CAST]]) #[[ATTR6]]
; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
; AMDGPU:       for.cond:
; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU:       for.cond.cleanup:
; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; AMDGPU-NEXT:    ret void
; AMDGPU:       for.body:
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[X_H2S:%.*]] = alloca i8, i64 4, align 4
; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-NEXT:    call void @use(ptr captures(none) [[X_H2S]]) #[[ATTR6]]
; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
; NVPTX:       for.cond:
; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX:       for.cond.cleanup:
; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; NVPTX-NEXT:    ret void
; NVPTX:       for.body:
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
;
entry:
  %captured_vars_addrs = alloca [0 x ptr], align 8
  %x = call align 4 ptr @__kmpc_alloc_shared(i64 4)
  call void @use(ptr nocapture %x) #10
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp slt i32 %i.0, 100
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  call void @spmd_amenable() #10
  call void @__kmpc_free_shared(ptr %x, i64 4)
  ret void

for.body:                                         ; preds = %for.cond
  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr %captured_vars_addrs, i64 0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond, !llvm.loop !25
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    call void @unknown() #[[ATTR7]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    call void @unknown() #[[ATTR7]]
; NVPTX-NEXT:    ret void
;
entry:
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT:    call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT:    call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    ret void
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  call void @__omp_outlined__3(ptr %.addr1, ptr %.zero.addr) #6
  ret void
}

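; One possible callee, @__omp_outlined_not_spmd_amenable, prevents
; SPMD-ization, so this kernel keeps the generic execution mode and the
; checks below verify that a custom worker state machine is generated
; instead, with the indirect call expanded into guarded direct calls.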
; Function Attrs: alwaysinline convergent norecurse nounwind
define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee
; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callee_kernel_environment, ptr null)
; AMDGPU-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU:       is_worker_check:
; AMDGPU-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU:       worker_state_machine.begin:
; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU:       worker_state_machine.finished:
; AMDGPU-NEXT:    ret void
; AMDGPU:       worker_state_machine.is_active.check:
; AMDGPU-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU:       worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU:       worker_state_machine.parallel_region.end:
; AMDGPU-NEXT:    call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU:       worker_state_machine.done.barrier:
; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU:       thread.user_code.check:
; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU:       common.ret:
; AMDGPU-NEXT:    ret void
; AMDGPU:       user_code.entry:
; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
; AMDGPU-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
; AMDGPU-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; AMDGPU:       3:
; AMDGPU-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7:%.*]]
; AMDGPU:       4:
; AMDGPU-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU:       5:
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[TMP7]]
; AMDGPU:       6:
; AMDGPU-NEXT:    unreachable
; AMDGPU:       7:
; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee
; NVPTX-SAME: (i1 [[C:%.*]]) #[[ATTR0]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callee_kernel_environment, ptr null)
; NVPTX-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX:       is_worker_check:
; NVPTX-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX:       worker_state_machine.begin:
; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX:       worker_state_machine.finished:
; NVPTX-NEXT:    ret void
; NVPTX:       worker_state_machine.is_active.check:
; NVPTX-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX:       worker_state_machine.parallel_region.fallback.execute:
; NVPTX-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX:       worker_state_machine.parallel_region.end:
; NVPTX-NEXT:    call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX:       worker_state_machine.done.barrier:
; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX:       thread.user_code.check:
; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX:       common.ret:
; NVPTX-NEXT:    ret void
; NVPTX:       user_code.entry:
; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
; NVPTX-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
; NVPTX-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NVPTX:       3:
; NVPTX-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7:%.*]]
; NVPTX:       4:
; NVPTX-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX:       5:
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT:    br label [[TMP7]]
; NVPTX:       6:
; NVPTX-NEXT:    unreachable
; NVPTX:       7:
; NVPTX-NEXT:    call void @__kmpc_target_deinit()
; NVPTX-NEXT:    br label [[COMMON_RET]]
;
entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  %0 = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callee_kernel_environment, ptr null)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:                                       ; preds = %entry, %user_code.entry
  ret void

user_code.entry:                                  ; preds = %entry
  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
  store i32 0, ptr %.zero.addr, align 4
  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
  %fp = select i1 %c, ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
  call void %fp(ptr %.threadid_temp., ptr %.zero.addr) #6
  call void @__kmpc_target_deinit()
  br label %common.ret
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-NEXT:    [[X:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR10]]
; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
; AMDGPU:       for.cond:
; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU:       for.cond.cleanup:
; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; AMDGPU-NEXT:    call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
; AMDGPU:       for.body:
; AMDGPU-NEXT:    store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]]
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-NEXT:    [[X:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR10]]
; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
; NVPTX:       for.cond:
; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX:       for.cond.cleanup:
; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6]]
; NVPTX-NEXT:    call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]]
; NVPTX-NEXT:    ret void
; NVPTX:       for.body:
; NVPTX-NEXT:    store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]]
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
entry:
  %captured_vars_addrs = alloca [1 x ptr], align 8
  %x = call align 4 ptr @__kmpc_alloc_shared(i64 4)
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp slt i32 %i.0, 100
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  call void @spmd_amenable() #10
  call void @__kmpc_free_shared(ptr %x, i64 4)
  ret void

for.body:                                         ; preds = %for.cond
  store ptr %x, ptr %captured_vars_addrs, align 8, !tbaa !26
  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr %captured_vars_addrs, i64 1)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond, !llvm.loop !28
}

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-NEXT:    store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @unknown() #[[ATTR7]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-NEXT:    store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @unknown() #[[ATTR7]]
; NVPTX-NEXT:    ret void
;
entry:
  %0 = load i32, ptr %x, align 4, !tbaa !18
  %inc = add nsw i32 %0, 1
  store i32 %inc, ptr %x, align 4, !tbaa !18
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]]
; AMDGPU-NEXT:    call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]]
; AMDGPU-NEXT:    ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]]
; NVPTX-NEXT:    call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]]
; NVPTX-NEXT:    ret void
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  %2 = load ptr, ptr %global_args, align 8
  %3 = load ptr, ptr %2, align 8, !tbaa !26
  call void @__omp_outlined__5(ptr %.addr1, ptr %.zero.addr, ptr %3) #6
  ret void
}

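; The indirect call below carries !callees metadata with a single
; SPMD-amenable target, so the checks expect it to be replaced by a direct
; call to @__omp_outlined_spmd_amenable_external and the kernel to be
; SPMD-ized.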
; Function Attrs: alwaysinline convergent norecurse nounwind
define weak ptx_kernel void @spmd_callees_metadata(ptr %fp) #0 {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees_metadata
; AMDGPU-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
; AMDGPU-NEXT:  entry:
; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null)
; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU:       common.ret:
; AMDGPU-NEXT:    ret void
; AMDGPU:       user_code.entry:
; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@spmd_callees_metadata
; NVPTX-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
; NVPTX-NEXT:  entry:
; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null)
; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX:       common.ret:
; NVPTX-NEXT:    ret void
; NVPTX:       user_code.entry:
; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; NVPTX-NEXT:    call void @__kmpc_target_deinit()
; NVPTX-NEXT:    br label [[COMMON_RET]]
;
entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  %0 = call i32 @__kmpc_target_init(ptr @spmd_callees_metadata_kernel_environment, ptr null)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:                                       ; preds = %entry, %user_code.entry
  ret void

user_code.entry:                                  ; preds = %entry
  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
  store i32 0, ptr %.zero.addr, align 4
  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
  call void %fp(ptr %.threadid_temp., ptr %.zero.addr), !callees !31
  call void @__kmpc_target_deinit()
  br label %common.ret
}

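; Here the !callees metadata lists both an SPMD-amenable and a
; non-SPMD-amenable target, so the kernel stays in generic mode with a worker
; state machine while the indirect call is still expanded into direct calls
; to the two known callees.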
713; Function Attrs: alwaysinline convergent norecurse nounwind
714define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 {
715;
716;
717; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata
718; AMDGPU-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
719; AMDGPU-NEXT:  entry:
720; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
721; AMDGPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
722; AMDGPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
723; AMDGPU-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null)
724; AMDGPU-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
725; AMDGPU-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
726; AMDGPU:       is_worker_check:
727; AMDGPU-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
728; AMDGPU-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
729; AMDGPU-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
730; AMDGPU-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
731; AMDGPU-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
732; AMDGPU:       worker_state_machine.begin:
733; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
734; AMDGPU-NEXT:    [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
735; AMDGPU-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
736; AMDGPU-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
737; AMDGPU-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
738; AMDGPU-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
739; AMDGPU:       worker_state_machine.finished:
740; AMDGPU-NEXT:    ret void
741; AMDGPU:       worker_state_machine.is_active.check:
742; AMDGPU-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
743; AMDGPU:       worker_state_machine.parallel_region.fallback.execute:
744; AMDGPU-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
745; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
746; AMDGPU:       worker_state_machine.parallel_region.end:
747; AMDGPU-NEXT:    call void @__kmpc_kernel_end_parallel()
748; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
749; AMDGPU:       worker_state_machine.done.barrier:
750; AMDGPU-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
751; AMDGPU-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
752; AMDGPU:       thread.user_code.check:
753; AMDGPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
754; AMDGPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
755; AMDGPU:       common.ret:
756; AMDGPU-NEXT:    ret void
757; AMDGPU:       user_code.entry:
758; AMDGPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
759; AMDGPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
760; AMDGPU-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
761; AMDGPU-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
762; AMDGPU-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
763; AMDGPU:       3:
764; AMDGPU-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
765; AMDGPU-NEXT:    br label [[TMP7:%.*]]
766; AMDGPU:       4:
767; AMDGPU-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
768; AMDGPU:       5:
769; AMDGPU-NEXT:    call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
770; AMDGPU-NEXT:    br label [[TMP7]]
771; AMDGPU:       6:
772; AMDGPU-NEXT:    unreachable
773; AMDGPU:       7:
774; AMDGPU-NEXT:    call void @__kmpc_target_deinit()
775; AMDGPU-NEXT:    br label [[COMMON_RET]]
776;
777; NVPTX-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata
778; NVPTX-SAME: (ptr [[FP:%.*]]) #[[ATTR0]] {
779; NVPTX-NEXT:  entry:
780; NVPTX-NEXT:    [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
781; NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
782; NVPTX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
783; NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null)
784; NVPTX-NEXT:    [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
785; NVPTX-NEXT:    br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
786; NVPTX:       is_worker_check:
787; NVPTX-NEXT:    [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
788; NVPTX-NEXT:    [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
789; NVPTX-NEXT:    [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
790; NVPTX-NEXT:    [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
791; NVPTX-NEXT:    br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
792; NVPTX:       worker_state_machine.begin:
793; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
794; NVPTX-NEXT:    [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
795; NVPTX-NEXT:    [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
796; NVPTX-NEXT:    [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
797; NVPTX-NEXT:    br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
798; NVPTX:       worker_state_machine.finished:
799; NVPTX-NEXT:    ret void
800; NVPTX:       worker_state_machine.is_active.check:
801; NVPTX-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
802; NVPTX:       worker_state_machine.parallel_region.fallback.execute:
803; NVPTX-NEXT:    call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
804; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
805; NVPTX:       worker_state_machine.parallel_region.end:
806; NVPTX-NEXT:    call void @__kmpc_kernel_end_parallel()
807; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
808; NVPTX:       worker_state_machine.done.barrier:
809; NVPTX-NEXT:    call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
810; NVPTX-NEXT:    br label [[WORKER_STATE_MACHINE_BEGIN]]
811; NVPTX:       thread.user_code.check:
812; NVPTX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
813; NVPTX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
814; NVPTX:       common.ret:
815; NVPTX-NEXT:    ret void
816; NVPTX:       user_code.entry:
817; NVPTX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
818; NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
819; NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
820; NVPTX-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
821; NVPTX-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
822; NVPTX:       3:
823; NVPTX-NEXT:    call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
824; NVPTX-NEXT:    br label [[TMP7:%.*]]
825; NVPTX:       4:
826; NVPTX-NEXT:    br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
827; NVPTX:       5:
828; NVPTX-NEXT:    call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
829; NVPTX-NEXT:    br label [[TMP7]]
830; NVPTX:       6:
831; NVPTX-NEXT:    unreachable
832; NVPTX:       7:
833; NVPTX-NEXT:    call void @__kmpc_target_deinit()
834; NVPTX-NEXT:    br label [[COMMON_RET]]
835;
836entry:
837  %.zero.addr = alloca i32, align 4
838  %.threadid_temp. = alloca i32, align 4
839  %0 = call i32 @__kmpc_target_init(ptr @spmd_and_non_spmd_callees_metadata_kernel_environment, ptr null)
840  %exec_user_code = icmp eq i32 %0, -1
841  br i1 %exec_user_code, label %user_code.entry, label %common.ret
842
843common.ret:                                       ; preds = %entry, %user_code.entry
844  ret void
845
846user_code.entry:                                  ; preds = %entry
847  %1 = call i32 @__kmpc_global_thread_num(ptr @1)
848  store i32 0, ptr %.zero.addr, align 4
849  store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
850  call void %fp(ptr %.threadid_temp., ptr %.zero.addr), !callees !32
851  call void @__kmpc_target_deinit()
852  br label %common.ret
853}
854
855; Function Attrs: alwaysinline convergent norecurse nounwind
856define void @__omp_outlined_spmd_amenable_external(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
857;
858;
859; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external
860; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
861; AMDGPU-NEXT:  entry:
862; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
863; AMDGPU:       for.cond:
864; AMDGPU-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
865; AMDGPU-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
866; AMDGPU-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
867; AMDGPU:       for.cond.cleanup:
868; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR6]]
869; AMDGPU-NEXT:    ret void
870; AMDGPU:       for.body:
871; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
872; AMDGPU-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0)
873; AMDGPU-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
874; AMDGPU-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
875;
876; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external
877; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
878; NVPTX-NEXT:  entry:
879; NVPTX-NEXT:    br label [[FOR_COND:%.*]]
880; NVPTX:       for.cond:
881; NVPTX-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
882; NVPTX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
883; NVPTX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
884; NVPTX:       for.cond.cleanup:
885; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR6]]
886; NVPTX-NEXT:    ret void
887; NVPTX:       for.body:
888; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
889; NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0)
890; NVPTX-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
891; NVPTX-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
892;
893entry:
894  br label %for.cond
895
896for.cond:                                         ; preds = %for.body, %entry
897  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
898  %cmp = icmp slt i32 %i.0, 100
899  br i1 %cmp, label %for.body, label %for.cond.cleanup
900
901for.cond.cleanup:                                 ; preds = %for.cond
902  call void @spmd_amenable() #10
903  ret void
904
905for.body:                                         ; preds = %for.cond
906  %0 = load i32, ptr %.global_tid., align 4, !tbaa !18
907  call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0)
908  %inc = add nsw i32 %i.0, 1
909  br label %for.cond, !llvm.loop !29
910}
911
912; Function Attrs: alwaysinline convergent norecurse nounwind
913define internal void @__omp_outlined__7(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) {
914;
915;
916; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7
917; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
918; AMDGPU-NEXT:  entry:
919; AMDGPU-NEXT:    ret void
920;
921; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7
922; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
923; NVPTX-NEXT:  entry:
924; NVPTX-NEXT:    ret void
925;
926entry:
927  ret void
928}
929
930; Function Attrs: convergent norecurse nounwind
931define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 {
932;
933;
934; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
935; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
936; AMDGPU-NEXT:  entry:
937; AMDGPU-NEXT:    ret void
938;
939; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
940; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
941; NVPTX-NEXT:  entry:
942; NVPTX-NEXT:    ret void
943;
944entry:
945  ret void
946}
947
948; Function Attrs: alwaysinline convergent norecurse nounwind
949define void @__omp_outlined_not_spmd_amenable_external(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
950; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable_external
951; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
952; AMDGPU-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTGLOBAL_TID_]], ptr [[DOTBOUND_TID_]])
953; AMDGPU-NEXT:    ret void
954;
955; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable_external
956; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
957; NVPTX-NEXT:    call void @__omp_outlined_not_spmd_amenable(ptr [[DOTGLOBAL_TID_]], ptr [[DOTBOUND_TID_]])
958; NVPTX-NEXT:    ret void
959;
960  call void @__omp_outlined_not_spmd_amenable(ptr %.global_tid., ptr %.bound_tid.);
961  ret void
962}
963
964define internal void @__omp_outlined_not_spmd_amenable(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
965;
966;
967; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable
968; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
969; AMDGPU-NEXT:  entry:
970; AMDGPU-NEXT:    call void @unknown() #[[ATTR7]]
971; AMDGPU-NEXT:    ret void
972;
973; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_not_spmd_amenable
974; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
975; NVPTX-NEXT:  entry:
976; NVPTX-NEXT:    call void @unknown() #[[ATTR7]]
977; NVPTX-NEXT:    ret void
978;
979entry:
980  call void @unknown() #11
981  ret void
982}

; Function Attrs: nosync nounwind
declare void @__kmpc_free_shared(ptr nocapture, i64) #8

; Function Attrs: nofree nosync nounwind
declare ptr @__kmpc_alloc_shared(i64) #7

; Function Attrs: convergent
declare void @use(ptr nocapture) #5

; Function Attrs: convergent
declare void @unknown() #2
declare void @unknowni32p(ptr) #2

; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1

; Make this a weak definition so that the custom state machine rewriting is
; still applied, while the body cannot be used in the interprocedural reasoning.
define weak i32 @__kmpc_target_init(ptr, ptr) {
;
;
; AMDGPU-LABEL: define {{[^@]+}}@__kmpc_target_init
; AMDGPU-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; AMDGPU-NEXT:    ret i32 0
;
; NVPTX-LABEL: define {{[^@]+}}@__kmpc_target_init
; NVPTX-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; NVPTX-NEXT:    ret i32 0
;
  ret i32 0
}

declare void @__kmpc_get_shared_variables(ptr)

; Function Attrs: alwaysinline
declare void @__kmpc_parallel_51(ptr, i32, i32, i32, i32, ptr, ptr, ptr, i64) #4
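; For reference, in the device runtime the parameters are roughly
; (ident, global_tid, if_expr, num_threads, proc_bind, fn, wrapper_fn,
; args, nargs). The call sites above pass -1 for num_threads and
; proc_bind (no clause given) and no shared arguments (ptr undef, i64 0).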

; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1

; Function Attrs: convergent
declare void @spmd_amenable() #5

; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(ptr) #6

declare void @__kmpc_target_deinit()

; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__9(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
;
;
entry:
  call void @unknown() #11
  ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 {
;
;
entry:
  %.addr1 = alloca i32, align 4
  %.zero.addr = alloca i32, align 4
  %global_args = alloca ptr, align 8
  store i32 %1, ptr %.addr1, align 4, !tbaa !18
  store i32 0, ptr %.zero.addr, align 4
  call void @__kmpc_get_shared_variables(ptr %global_args)
  call void @__omp_outlined__9(ptr %.addr1, ptr %.zero.addr) #6
  ret void
}

declare fastcc i32 @__kmpc_get_hardware_thread_id_in_block()

attributes #0 = { alwaysinline convergent norecurse nounwind "kernel" }
attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
attributes #2 = { convergent }
attributes #3 = { convergent norecurse nounwind }
attributes #4 = { alwaysinline }
attributes #5 = { convergent "llvm.assume"="ompx_spmd_amenable" }
attributes #6 = { nounwind }
attributes #7 = { nofree nosync nounwind }
attributes #8 = { nosync nounwind }
attributes #9 = { alwaysinline convergent nounwind }
attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" }
attributes #11 = { convergent }
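
; Note: attribute groups #5 and #10 attach the "ompx_spmd_amenable"
; assumption. openmp-opt takes calls carrying this assumption to be
; compatible with SPMD execution, so only the unannotated convergent
; calls (e.g. @unknown, via #2/#11) can block SPMD-ization of a kernel.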

!omp_offload.info = !{!0, !1, !2, !3, !4, !5}
!llvm.module.flags = !{!12, !13, !14, !15, !16}
!llvm.ident = !{!17}

!0 = !{i32 0, i32 64770, i32 541341486, !"", i32 74, i32 5}
!1 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
!2 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
!3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
!4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
!5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
!12 = !{i32 1, !"wchar_size", i32 4}
!13 = !{i32 7, !"openmp", i32 50}
!14 = !{i32 7, !"openmp-device", i32 50}
!15 = !{i32 8, !"PIC Level", i32 2}
!16 = !{i32 7, !"frame-pointer", i32 2}
!17 = !{!"clang version 14.0.0"}
!18 = !{!19, !19, i64 0}
!19 = !{!"int", !20, i64 0}
!20 = !{!"omnipotent char", !21, i64 0}
!21 = !{!"Simple C/C++ TBAA"}
!22 = distinct !{!22, !23, !24}
!23 = !{!"llvm.loop.mustprogress"}
!24 = !{!"llvm.loop.unroll.disable"}
!25 = distinct !{!25, !23, !24}
!26 = !{!27, !27, i64 0}
!27 = !{!"any pointer", !20, i64 0}
!28 = distinct !{!28, !23, !24}
!29 = distinct !{!29, !23, !24}
!30 = !{!31, !27, i64 0}
!31 = !{ptr @__omp_outlined_spmd_amenable_external, ptr @__omp_outlined_not_spmd_amenable}
!32 = !{ptr @__omp_outlined_spmd_amenable_external, ptr @__omp_outlined_not_spmd_amenable_external}
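; Note: !31 and !32 enumerate the possible targets of the indirect calls in
; the *_metadata kernels earlier in the file (attached as callee metadata at
; the call sites). With such a list, openmp-opt only needs to prove SPMD
; amenability for the enumerated functions rather than for every
; address-taken function in the module.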
;.
; AMDGPU: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; AMDGPU: attributes #[[ATTR1]] = { norecurse }
; AMDGPU: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; AMDGPU: attributes #[[ATTR3]] = { norecurse nounwind }
; AMDGPU: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
; AMDGPU: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; AMDGPU: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; AMDGPU: attributes #[[ATTR7]] = { convergent }
; AMDGPU: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; AMDGPU: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
; AMDGPU: attributes #[[ATTR10]] = { nounwind }
; AMDGPU: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; NVPTX: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; NVPTX: attributes #[[ATTR1]] = { norecurse }
; NVPTX: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; NVPTX: attributes #[[ATTR3]] = { norecurse nounwind }
; NVPTX: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
; NVPTX: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; NVPTX: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; NVPTX: attributes #[[ATTR7]] = { convergent }
; NVPTX: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; NVPTX: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
; NVPTX: attributes #[[ATTR10]] = { nounwind }
; NVPTX: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; AMDGPU: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"", i32 74, i32 5}
; AMDGPU: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; AMDGPU: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; AMDGPU: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; AMDGPU: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; AMDGPU: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0}
; AMDGPU: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0}
; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"}
; AMDGPU: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]}
; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"}
; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"}
; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]}
; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0}
; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]}
; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]}
;.
; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"", i32 74, i32 5}
; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; NVPTX: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; NVPTX: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; NVPTX: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; NVPTX: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0}
; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0}
; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"}
; NVPTX: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]}
; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"}
; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"}
; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]}
; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0}
; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]}
; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]}
;.