xref: /llvm-project/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (revision 4e3310a81391fbc283d263715a68d8732e73d01d)
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // REQUIRES: amdgpu-registered-target
3 
4 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
5 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=IR-GPU
6 
7 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
8 
9 // Check same results after serialization round-trip
10 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
11 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
12 
13 // expected-no-diagnostics
14 
15 #ifndef HEADER
16 #define HEADER
17 int foo() {
18   int i;
19   int j;
20   int sum[10][10];
21 
22   #pragma omp target teams loop reduction(+:sum) collapse(2) \
23       bind(parallel) order(concurrent) lastprivate(j) map(tofrom:sum)
24   for(i=0; i<10; i++)
25     for(j=0; j<10; j++)
26       sum[i][j] += i;
27 
28   return 0;
29 }
30 #endif
31 // IR-PCH-HOST-LABEL: define {{[^@]+}}@_Z3foov
32 // IR-PCH-HOST-SAME: () #[[ATTR0:[0-9]+]] {
33 // IR-PCH-HOST-NEXT:  entry:
34 // IR-PCH-HOST-NEXT:    [[I:%.*]] = alloca i32, align 4
35 // IR-PCH-HOST-NEXT:    [[J:%.*]] = alloca i32, align 4
36 // IR-PCH-HOST-NEXT:    [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
37 // IR-PCH-HOST-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
38 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load i32, ptr [[J]], align 4
39 // IR-PCH-HOST-NEXT:    store i32 [[TMP0]], ptr [[J_CASTED]], align 4
40 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
41 // IR-PCH-HOST-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
42 // IR-PCH-HOST-NEXT:    ret i32 0
43 // IR-PCH-HOST-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
44 // IR-PCH-HOST-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
45 // IR-PCH-HOST-NEXT:  entry:
46 // IR-PCH-HOST-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
47 // IR-PCH-HOST-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
48 // IR-PCH-HOST-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
49 // IR-PCH-HOST-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
50 // IR-PCH-HOST-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
51 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
52 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
53 // IR-PCH-HOST-NEXT:    store i32 [[TMP1]], ptr [[J_CASTED]], align 4
54 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
55 // IR-PCH-HOST-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @.omp_outlined., i64 [[TMP2]], ptr [[TMP0]])
56 // IR-PCH-HOST-NEXT:    ret void
57 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined.
58 // IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
59 // IR-PCH-HOST-NEXT:  entry:
60 // IR-PCH-HOST-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
61 // IR-PCH-HOST-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
62 // IR-PCH-HOST-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
63 // IR-PCH-HOST-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
64 // IR-PCH-HOST-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
65 // IR-PCH-HOST-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
66 // IR-PCH-HOST-NEXT:    [[TMP:%.*]] = alloca i32, align 4
67 // IR-PCH-HOST-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
68 // IR-PCH-HOST-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
69 // IR-PCH-HOST-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
70 // IR-PCH-HOST-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
71 // IR-PCH-HOST-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
72 // IR-PCH-HOST-NEXT:    [[J3:%.*]] = alloca i32, align 4
73 // IR-PCH-HOST-NEXT:    [[I:%.*]] = alloca i32, align 4
74 // IR-PCH-HOST-NEXT:    [[J4:%.*]] = alloca i32, align 4
75 // IR-PCH-HOST-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
76 // IR-PCH-HOST-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
77 // IR-PCH-HOST-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
78 // IR-PCH-HOST-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
79 // IR-PCH-HOST-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
80 // IR-PCH-HOST-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
81 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
82 // IR-PCH-HOST-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
83 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
84 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
85 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
86 // IR-PCH-HOST:       omp.arrayinit.body:
87 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
88 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
89 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
90 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
91 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
92 // IR-PCH-HOST:       omp.arrayinit.done:
93 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
94 // IR-PCH-HOST-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
95 // IR-PCH-HOST-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
96 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
97 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
98 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
99 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
100 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
101 // IR-PCH-HOST-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
102 // IR-PCH-HOST-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
103 // IR-PCH-HOST:       cond.true:
104 // IR-PCH-HOST-NEXT:    br label [[COND_END:%.*]]
105 // IR-PCH-HOST:       cond.false:
106 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
107 // IR-PCH-HOST-NEXT:    br label [[COND_END]]
108 // IR-PCH-HOST:       cond.end:
109 // IR-PCH-HOST-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
110 // IR-PCH-HOST-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
111 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
112 // IR-PCH-HOST-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
113 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
114 // IR-PCH-HOST:       omp.inner.for.cond:
115 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
116 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
117 // IR-PCH-HOST-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
118 // IR-PCH-HOST-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
119 // IR-PCH-HOST:       omp.inner.for.body:
120 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
121 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
122 // IR-PCH-HOST-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
123 // IR-PCH-HOST-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
124 // IR-PCH-HOST-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
125 // IR-PCH-HOST-NEXT:    store i32 [[TMP13]], ptr [[J_CASTED]], align 4
126 // IR-PCH-HOST-NEXT:    [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
127 // IR-PCH-HOST-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @.omp_outlined..1, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
128 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
129 // IR-PCH-HOST:       omp.inner.for.inc:
130 // IR-PCH-HOST-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
131 // IR-PCH-HOST-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
132 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
133 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
134 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND]]
135 // IR-PCH-HOST:       omp.inner.for.end:
136 // IR-PCH-HOST-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
137 // IR-PCH-HOST:       omp.loop.exit:
138 // IR-PCH-HOST-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
139 // IR-PCH-HOST-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
140 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
141 // IR-PCH-HOST-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
142 // IR-PCH-HOST-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
143 // IR-PCH-HOST-NEXT:    br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
144 // IR-PCH-HOST:       .omp.lastprivate.then:
145 // IR-PCH-HOST-NEXT:    store i32 10, ptr [[J3]], align 4
146 // IR-PCH-HOST-NEXT:    [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
147 // IR-PCH-HOST-NEXT:    store i32 [[TMP21]], ptr [[J_ADDR]], align 4
148 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
149 // IR-PCH-HOST:       .omp.lastprivate.done:
150 // IR-PCH-HOST-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
151 // IR-PCH-HOST-NEXT:    store ptr [[SUM1]], ptr [[TMP22]], align 8
152 // IR-PCH-HOST-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
153 // IR-PCH-HOST-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
154 // IR-PCH-HOST-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func.2, ptr @.gomp_critical_user_.reduction.var)
155 // IR-PCH-HOST-NEXT:    switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
156 // IR-PCH-HOST-NEXT:    i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
157 // IR-PCH-HOST-NEXT:    i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
158 // IR-PCH-HOST-NEXT:    ]
159 // IR-PCH-HOST:       .omp.reduction.case1:
160 // IR-PCH-HOST-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
161 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
162 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
163 // IR-PCH-HOST:       omp.arraycpy.body:
164 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
165 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
166 // IR-PCH-HOST-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
167 // IR-PCH-HOST-NEXT:    [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
168 // IR-PCH-HOST-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
169 // IR-PCH-HOST-NEXT:    store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
170 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
171 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
172 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
173 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
174 // IR-PCH-HOST:       omp.arraycpy.done10:
175 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
176 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
177 // IR-PCH-HOST:       .omp.reduction.case2:
178 // IR-PCH-HOST-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
179 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
180 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
181 // IR-PCH-HOST:       omp.arraycpy.body12:
182 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
183 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
184 // IR-PCH-HOST-NEXT:    [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
185 // IR-PCH-HOST-NEXT:    [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
186 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
187 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
188 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
189 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
190 // IR-PCH-HOST:       omp.arraycpy.done18:
191 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
192 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
193 // IR-PCH-HOST:       .omp.reduction.default:
194 // IR-PCH-HOST-NEXT:    ret void
195 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined..1
196 // IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
197 // IR-PCH-HOST-NEXT:  entry:
198 // IR-PCH-HOST-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
199 // IR-PCH-HOST-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
200 // IR-PCH-HOST-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
201 // IR-PCH-HOST-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
202 // IR-PCH-HOST-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
203 // IR-PCH-HOST-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
204 // IR-PCH-HOST-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
205 // IR-PCH-HOST-NEXT:    [[TMP:%.*]] = alloca i32, align 4
206 // IR-PCH-HOST-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
207 // IR-PCH-HOST-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
208 // IR-PCH-HOST-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
209 // IR-PCH-HOST-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
210 // IR-PCH-HOST-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
211 // IR-PCH-HOST-NEXT:    [[J3:%.*]] = alloca i32, align 4
212 // IR-PCH-HOST-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
213 // IR-PCH-HOST-NEXT:    [[I:%.*]] = alloca i32, align 4
214 // IR-PCH-HOST-NEXT:    [[J5:%.*]] = alloca i32, align 4
215 // IR-PCH-HOST-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
216 // IR-PCH-HOST-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
217 // IR-PCH-HOST-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
218 // IR-PCH-HOST-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
219 // IR-PCH-HOST-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
220 // IR-PCH-HOST-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
221 // IR-PCH-HOST-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
222 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
223 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
224 // IR-PCH-HOST-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
225 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
226 // IR-PCH-HOST-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
227 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
228 // IR-PCH-HOST-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
229 // IR-PCH-HOST-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
230 // IR-PCH-HOST-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
231 // IR-PCH-HOST-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
232 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
233 // IR-PCH-HOST-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
234 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
235 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
236 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
237 // IR-PCH-HOST:       omp.arrayinit.body:
238 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
239 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
240 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
241 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
242 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
243 // IR-PCH-HOST:       omp.arrayinit.done:
244 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
245 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
246 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
247 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
248 // IR-PCH-HOST-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
249 // IR-PCH-HOST-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
250 // IR-PCH-HOST:       cond.true:
251 // IR-PCH-HOST-NEXT:    br label [[COND_END:%.*]]
252 // IR-PCH-HOST:       cond.false:
253 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
254 // IR-PCH-HOST-NEXT:    br label [[COND_END]]
255 // IR-PCH-HOST:       cond.end:
256 // IR-PCH-HOST-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
257 // IR-PCH-HOST-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
258 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
259 // IR-PCH-HOST-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
260 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
261 // IR-PCH-HOST:       omp.inner.for.cond:
262 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
263 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
264 // IR-PCH-HOST-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
265 // IR-PCH-HOST-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
266 // IR-PCH-HOST:       omp.inner.for.body:
267 // IR-PCH-HOST-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
268 // IR-PCH-HOST-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
269 // IR-PCH-HOST-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
270 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
271 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
272 // IR-PCH-HOST-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
273 // IR-PCH-HOST-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
274 // IR-PCH-HOST-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
275 // IR-PCH-HOST-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
276 // IR-PCH-HOST-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
277 // IR-PCH-HOST-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
278 // IR-PCH-HOST-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
279 // IR-PCH-HOST-NEXT:    store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
280 // IR-PCH-HOST-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
281 // IR-PCH-HOST-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
282 // IR-PCH-HOST-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
283 // IR-PCH-HOST-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
284 // IR-PCH-HOST-NEXT:    [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
285 // IR-PCH-HOST-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
286 // IR-PCH-HOST-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
287 // IR-PCH-HOST-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
288 // IR-PCH-HOST-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
289 // IR-PCH-HOST-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
290 // IR-PCH-HOST-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
291 // IR-PCH-HOST:       omp.body.continue:
292 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
293 // IR-PCH-HOST:       omp.inner.for.inc:
294 // IR-PCH-HOST-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
295 // IR-PCH-HOST-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
296 // IR-PCH-HOST-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
297 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
298 // IR-PCH-HOST:       omp.inner.for.end:
299 // IR-PCH-HOST-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
300 // IR-PCH-HOST:       omp.loop.exit:
301 // IR-PCH-HOST-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
302 // IR-PCH-HOST-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
303 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
304 // IR-PCH-HOST-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
305 // IR-PCH-HOST-NEXT:    store ptr [[SUM4]], ptr [[TMP21]], align 8
306 // IR-PCH-HOST-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
307 // IR-PCH-HOST-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
308 // IR-PCH-HOST-NEXT:    [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
309 // IR-PCH-HOST-NEXT:    switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
310 // IR-PCH-HOST-NEXT:    i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
311 // IR-PCH-HOST-NEXT:    i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
312 // IR-PCH-HOST-NEXT:    ]
313 // IR-PCH-HOST:       .omp.reduction.case1:
314 // IR-PCH-HOST-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
315 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
316 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
317 // IR-PCH-HOST:       omp.arraycpy.body:
318 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
319 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
320 // IR-PCH-HOST-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
321 // IR-PCH-HOST-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
322 // IR-PCH-HOST-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
323 // IR-PCH-HOST-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
324 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
325 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
326 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
327 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
328 // IR-PCH-HOST:       omp.arraycpy.done19:
329 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
330 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
331 // IR-PCH-HOST:       .omp.reduction.case2:
332 // IR-PCH-HOST-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
333 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
334 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
335 // IR-PCH-HOST:       omp.arraycpy.body21:
336 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
337 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
338 // IR-PCH-HOST-NEXT:    [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
339 // IR-PCH-HOST-NEXT:    [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
340 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
341 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
342 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
343 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
344 // IR-PCH-HOST:       omp.arraycpy.done27:
345 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
346 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
347 // IR-PCH-HOST:       .omp.reduction.default:
348 // IR-PCH-HOST-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
349 // IR-PCH-HOST-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
350 // IR-PCH-HOST-NEXT:    br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
351 // IR-PCH-HOST:       .omp.lastprivate.then:
352 // IR-PCH-HOST-NEXT:    store i32 10, ptr [[J3]], align 4
353 // IR-PCH-HOST-NEXT:    [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
354 // IR-PCH-HOST-NEXT:    store i32 [[TMP33]], ptr [[J_ADDR]], align 4
355 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
356 // IR-PCH-HOST:       .omp.lastprivate.done:
357 // IR-PCH-HOST-NEXT:    ret void
358 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func
359 // IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
360 // IR-PCH-HOST-NEXT:  entry:
361 // IR-PCH-HOST-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
362 // IR-PCH-HOST-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
363 // IR-PCH-HOST-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
364 // IR-PCH-HOST-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
365 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
366 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
367 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
368 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
369 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
370 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
371 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
372 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
373 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
374 // IR-PCH-HOST:       omp.arraycpy.body:
375 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
376 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
377 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
378 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
379 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
380 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
381 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
382 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
383 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
384 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
385 // IR-PCH-HOST:       omp.arraycpy.done2:
386 // IR-PCH-HOST-NEXT:    ret void
387 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func.2
388 // IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
389 // IR-PCH-HOST-NEXT:  entry:
390 // IR-PCH-HOST-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
391 // IR-PCH-HOST-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
392 // IR-PCH-HOST-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
393 // IR-PCH-HOST-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
394 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
395 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
396 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
397 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
398 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
399 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
400 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
401 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
402 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
403 // IR-PCH-HOST:       omp.arraycpy.body:
404 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
405 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
406 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
407 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
408 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
409 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
410 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
411 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
412 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
413 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
414 // IR-PCH-HOST:       omp.arraycpy.done2:
415 // IR-PCH-HOST-NEXT:    ret void
416 // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
417 // CHECK-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
418 // CHECK-NEXT:  entry:
419 // CHECK-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
420 // CHECK-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
421 // CHECK-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
422 // CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
423 // CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
424 // CHECK-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
425 // CHECK-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
426 // CHECK-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
427 // CHECK-NEXT:    [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
428 // CHECK-NEXT:    [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
429 // CHECK-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
430 // CHECK-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
431 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
432 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false)
433 // CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
434 // CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
435 // CHECK:       user_code.entry:
436 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
437 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
438 // CHECK-NEXT:    store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
439 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
440 // CHECK-NEXT:    store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
441 // CHECK-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
442 // CHECK-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
443 // CHECK-NEXT:    call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
444 // CHECK-NEXT:    ret void
445 // CHECK:       worker.exit:
446 // CHECK-NEXT:    ret void
447 // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
448 // CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
449 // CHECK-NEXT:  entry:
450 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
451 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
452 // CHECK-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
453 // CHECK-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
454 // CHECK-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
455 // CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
456 // CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
457 // CHECK-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
458 // CHECK-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
459 // CHECK-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
460 // CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
461 // CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
462 // CHECK-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
463 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
464 // CHECK-NEXT:    [[J4:%.*]] = alloca i32, align 4, addrspace(5)
465 // CHECK-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
466 // CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
467 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
468 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
469 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
470 // CHECK-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
471 // CHECK-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
472 // CHECK-NEXT:    [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
473 // CHECK-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
474 // CHECK-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
475 // CHECK-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
476 // CHECK-NEXT:    [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
477 // CHECK-NEXT:    [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
478 // CHECK-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
479 // CHECK-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
480 // CHECK-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
481 // CHECK-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
482 // CHECK-NEXT:    [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
483 // CHECK-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
484 // CHECK-NEXT:    [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
485 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
486 // CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
487 // CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
488 // CHECK-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
489 // CHECK-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
490 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
491 // CHECK-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
492 // CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
493 // CHECK-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
494 // CHECK-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
495 // CHECK:       omp.arrayinit.body:
496 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
497 // CHECK-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
498 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
499 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
500 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
501 // CHECK:       omp.arrayinit.done:
502 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
503 // CHECK-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
504 // CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
505 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
506 // CHECK-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
507 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
508 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
509 // CHECK-NEXT:    call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
510 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
511 // CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
512 // CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
513 // CHECK:       cond.true:
514 // CHECK-NEXT:    br label [[COND_END:%.*]]
515 // CHECK:       cond.false:
516 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
517 // CHECK-NEXT:    br label [[COND_END]]
518 // CHECK:       cond.end:
519 // CHECK-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
520 // CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
521 // CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
522 // CHECK-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
523 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
524 // CHECK:       omp.inner.for.cond:
525 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
526 // CHECK-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
527 // CHECK-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
528 // CHECK:       omp.inner.for.body:
529 // CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
530 // CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
531 // CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
532 // CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
533 // CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
534 // CHECK-NEXT:    store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
535 // CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
536 // CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
537 // CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
538 // CHECK-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
539 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
540 // CHECK-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
541 // CHECK-NEXT:    store ptr [[TMP17]], ptr [[TMP16]], align 8
542 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
543 // CHECK-NEXT:    [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
544 // CHECK-NEXT:    store ptr [[TMP19]], ptr [[TMP18]], align 8
545 // CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
546 // CHECK-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
547 // CHECK-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
548 // CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
549 // CHECK-NEXT:    call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__.1, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
550 // CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
551 // CHECK:       omp.inner.for.inc:
552 // CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
553 // CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
554 // CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
555 // CHECK-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
556 // CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
557 // CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
558 // CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
559 // CHECK-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
560 // CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
561 // CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
562 // CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
563 // CHECK-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
564 // CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
565 // CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
566 // CHECK-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
567 // CHECK:       cond.true9:
568 // CHECK-NEXT:    br label [[COND_END11:%.*]]
569 // CHECK:       cond.false10:
570 // CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
571 // CHECK-NEXT:    br label [[COND_END11]]
572 // CHECK:       cond.end11:
573 // CHECK-NEXT:    [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
574 // CHECK-NEXT:    store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
575 // CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
576 // CHECK-NEXT:    store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
577 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
578 // CHECK:       omp.inner.for.end:
579 // CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
580 // CHECK:       omp.loop.exit:
581 // CHECK-NEXT:    [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
582 // CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
583 // CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP33]])
584 // CHECK-NEXT:    [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
585 // CHECK-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
586 // CHECK-NEXT:    br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
587 // CHECK:       .omp.lastprivate.then:
588 // CHECK-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
589 // CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
590 // CHECK-NEXT:    store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
591 // CHECK-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
592 // CHECK:       .omp.lastprivate.done:
593 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
594 // CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
595 // CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
596 // CHECK-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
597 // CHECK-NEXT:    [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
598 // CHECK-NEXT:    [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.3, ptr @_omp_reduction_inter_warp_copy_func.4, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
599 // CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
600 // CHECK-NEXT:    br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
601 // CHECK:       .omp.reduction.then:
602 // CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
603 // CHECK-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
604 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
605 // CHECK:       omp.arraycpy.body:
606 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
607 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
608 // CHECK-NEXT:    [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
609 // CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
610 // CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
611 // CHECK-NEXT:    store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
612 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
613 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
614 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
615 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
616 // CHECK:       omp.arraycpy.done17:
617 // CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
618 // CHECK:       .omp.reduction.done:
619 // CHECK-NEXT:    ret void
620 // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__.1
621 // CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
622 // CHECK-NEXT:  entry:
623 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
624 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
625 // CHECK-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
626 // CHECK-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
627 // CHECK-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
628 // CHECK-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
629 // CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
630 // CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
631 // CHECK-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
632 // CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
633 // CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
634 // CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
635 // CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
636 // CHECK-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
637 // CHECK-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
638 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
639 // CHECK-NEXT:    [[J5:%.*]] = alloca i32, align 4, addrspace(5)
640 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
641 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
642 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
643 // CHECK-NEXT:    [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
644 // CHECK-NEXT:    [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
645 // CHECK-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
646 // CHECK-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
647 // CHECK-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
648 // CHECK-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
649 // CHECK-NEXT:    [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
650 // CHECK-NEXT:    [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
651 // CHECK-NEXT:    [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
652 // CHECK-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
653 // CHECK-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
654 // CHECK-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
655 // CHECK-NEXT:    [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
656 // CHECK-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
657 // CHECK-NEXT:    [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
658 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
659 // CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
660 // CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
661 // CHECK-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
662 // CHECK-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
663 // CHECK-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
664 // CHECK-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
665 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
666 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
667 // CHECK-NEXT:    store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
668 // CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
669 // CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
670 // CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
671 // CHECK-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
672 // CHECK-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
673 // CHECK-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
674 // CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
675 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
676 // CHECK-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
677 // CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
678 // CHECK-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
679 // CHECK-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
680 // CHECK:       omp.arrayinit.body:
681 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
682 // CHECK-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
683 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
684 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
685 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
686 // CHECK:       omp.arrayinit.done:
687 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
688 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
689 // CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
690 // CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
691 // CHECK-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
692 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
693 // CHECK:       omp.inner.for.cond:
694 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
695 // CHECK-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
696 // CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
697 // CHECK-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
698 // CHECK-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
699 // CHECK:       omp.inner.for.body:
700 // CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
701 // CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
702 // CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
703 // CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
704 // CHECK-NEXT:    store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
705 // CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
706 // CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
707 // CHECK-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
708 // CHECK-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
709 // CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
710 // CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
711 // CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
712 // CHECK-NEXT:    store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
713 // CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
714 // CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
715 // CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
716 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
717 // CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
718 // CHECK-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
719 // CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
720 // CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
721 // CHECK-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
722 // CHECK-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
723 // CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
724 // CHECK:       omp.body.continue:
725 // CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
726 // CHECK:       omp.inner.for.inc:
727 // CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
728 // CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
729 // CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
730 // CHECK-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
731 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
732 // CHECK:       omp.inner.for.end:
733 // CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
734 // CHECK:       omp.loop.exit:
735 // CHECK-NEXT:    [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
736 // CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
737 // CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
738 // CHECK-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
739 // CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
740 // CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
741 // CHECK-NEXT:    store ptr [[SUM4_ASCAST]], ptr [[TMP22]], align 8
742 // CHECK-NEXT:    [[TMP23:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP21]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
743 // CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP23]], 1
744 // CHECK-NEXT:    br i1 [[TMP24]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
745 // CHECK:       .omp.reduction.then:
746 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
747 // CHECK-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
748 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
749 // CHECK:       omp.arraycpy.body:
750 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
751 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
752 // CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
753 // CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
754 // CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
755 // CHECK-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
756 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
757 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
758 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
759 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
760 // CHECK:       omp.arraycpy.done19:
761 // CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
762 // CHECK:       .omp.reduction.done:
763 // CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
764 // CHECK-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
765 // CHECK-NEXT:    br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
766 // CHECK:       .omp.lastprivate.then:
767 // CHECK-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
768 // CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
769 // CHECK-NEXT:    store i32 [[TMP30]], ptr [[J_ADDR_ASCAST]], align 4
770 // CHECK-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
771 // CHECK:       .omp.lastprivate.done:
772 // CHECK-NEXT:    ret void
773 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
774 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
775 // CHECK-NEXT:  entry:
776 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
777 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
778 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
779 // CHECK-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
780 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
781 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
782 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
783 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
784 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
785 // CHECK-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
786 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
787 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
788 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
789 // CHECK-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
790 // CHECK-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
791 // CHECK-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
792 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
793 // CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
794 // CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
795 // CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
796 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
797 // CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
798 // CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
799 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
800 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
801 // CHECK:       .shuffle.pre_cond:
802 // CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
803 // CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
804 // CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
805 // CHECK-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
806 // CHECK-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
807 // CHECK-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
808 // CHECK-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
809 // CHECK-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
810 // CHECK:       .shuffle.then:
811 // CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
812 // CHECK-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
813 // CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
814 // CHECK-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
815 // CHECK-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
816 // CHECK-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
817 // CHECK-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
818 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
819 // CHECK:       .shuffle.exit:
820 // CHECK-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
821 // CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
822 // CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
823 // CHECK-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
824 // CHECK-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
825 // CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
826 // CHECK-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
827 // CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
828 // CHECK-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
829 // CHECK-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
830 // CHECK-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
831 // CHECK-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
832 // CHECK-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
833 // CHECK-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
834 // CHECK:       then:
835 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
836 // CHECK-NEXT:    br label [[IFCONT:%.*]]
837 // CHECK:       else:
838 // CHECK-NEXT:    br label [[IFCONT]]
839 // CHECK:       ifcont:
840 // CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
841 // CHECK-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
842 // CHECK-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
843 // CHECK-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
844 // CHECK:       then4:
845 // CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
846 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
847 // CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
848 // CHECK-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
849 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
850 // CHECK-NEXT:    br label [[IFCONT6:%.*]]
851 // CHECK:       else5:
852 // CHECK-NEXT:    br label [[IFCONT6]]
853 // CHECK:       ifcont6:
854 // CHECK-NEXT:    ret void
855 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
856 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
857 // CHECK-NEXT:  entry:
858 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
859 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
860 // CHECK-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
861 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
862 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
863 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
864 // CHECK-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
865 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
866 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
867 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
868 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
869 // CHECK-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
870 // CHECK-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
871 // CHECK-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
872 // CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
873 // CHECK-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
874 // CHECK-NEXT:    br label [[PRECOND:%.*]]
875 // CHECK:       precond:
876 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
877 // CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
878 // CHECK-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
879 // CHECK:       body:
880 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
881 // CHECK-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
882 // CHECK-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
883 // CHECK:       then:
884 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
885 // CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
886 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
887 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
888 // CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
889 // CHECK-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
890 // CHECK-NEXT:    br label [[IFCONT:%.*]]
891 // CHECK:       else:
892 // CHECK-NEXT:    br label [[IFCONT]]
893 // CHECK:       ifcont:
894 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
895 // CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
896 // CHECK-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
897 // CHECK-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
898 // CHECK:       then2:
899 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
900 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
901 // CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
902 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
903 // CHECK-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
904 // CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
905 // CHECK-NEXT:    br label [[IFCONT4:%.*]]
906 // CHECK:       else3:
907 // CHECK-NEXT:    br label [[IFCONT4]]
908 // CHECK:       ifcont4:
909 // CHECK-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
910 // CHECK-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
911 // CHECK-NEXT:    br label [[PRECOND]]
912 // CHECK:       exit:
913 // CHECK-NEXT:    ret void
914 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.3
915 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
916 // CHECK-NEXT:  entry:
917 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
918 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
919 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
920 // CHECK-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
921 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
922 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
923 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
924 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
925 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
926 // CHECK-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
927 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
928 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
929 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
930 // CHECK-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
931 // CHECK-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
932 // CHECK-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
933 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
934 // CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
935 // CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
936 // CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
937 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
938 // CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
939 // CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
940 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
941 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
942 // CHECK:       .shuffle.pre_cond:
943 // CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
944 // CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
945 // CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
946 // CHECK-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
947 // CHECK-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
948 // CHECK-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
949 // CHECK-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
950 // CHECK-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
951 // CHECK:       .shuffle.then:
952 // CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
953 // CHECK-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
954 // CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
955 // CHECK-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
956 // CHECK-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
957 // CHECK-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
958 // CHECK-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
959 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
960 // CHECK:       .shuffle.exit:
961 // CHECK-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
962 // CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
963 // CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
964 // CHECK-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
965 // CHECK-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
966 // CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
967 // CHECK-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
968 // CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
969 // CHECK-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
970 // CHECK-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
971 // CHECK-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
972 // CHECK-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
973 // CHECK-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
974 // CHECK-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
975 // CHECK:       then:
976 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func.2"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
977 // CHECK-NEXT:    br label [[IFCONT:%.*]]
978 // CHECK:       else:
979 // CHECK-NEXT:    br label [[IFCONT]]
980 // CHECK:       ifcont:
981 // CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
982 // CHECK-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
983 // CHECK-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
984 // CHECK-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
985 // CHECK:       then4:
986 // CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
987 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
988 // CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
989 // CHECK-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
990 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
991 // CHECK-NEXT:    br label [[IFCONT6:%.*]]
992 // CHECK:       else5:
993 // CHECK-NEXT:    br label [[IFCONT6]]
994 // CHECK:       ifcont6:
995 // CHECK-NEXT:    ret void
996 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.4
997 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
998 // CHECK-NEXT:  entry:
999 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1000 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1001 // CHECK-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1002 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1003 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1004 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1005 // CHECK-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1006 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1007 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1008 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1009 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1010 // CHECK-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1011 // CHECK-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1012 // CHECK-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1013 // CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1014 // CHECK-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1015 // CHECK-NEXT:    br label [[PRECOND:%.*]]
1016 // CHECK:       precond:
1017 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1018 // CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1019 // CHECK-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1020 // CHECK:       body:
1021 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1022 // CHECK-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1023 // CHECK-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1024 // CHECK:       then:
1025 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1026 // CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1027 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1028 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1029 // CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1030 // CHECK-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1031 // CHECK-NEXT:    br label [[IFCONT:%.*]]
1032 // CHECK:       else:
1033 // CHECK-NEXT:    br label [[IFCONT]]
1034 // CHECK:       ifcont:
1035 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1036 // CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1037 // CHECK-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1038 // CHECK-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1039 // CHECK:       then2:
1040 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1041 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1042 // CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1043 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1044 // CHECK-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1045 // CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
1046 // CHECK-NEXT:    br label [[IFCONT4:%.*]]
1047 // CHECK:       else3:
1048 // CHECK-NEXT:    br label [[IFCONT4]]
1049 // CHECK:       ifcont4:
1050 // CHECK-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1051 // CHECK-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1052 // CHECK-NEXT:    br label [[PRECOND]]
1053 // CHECK:       exit:
1054 // CHECK-NEXT:    ret void
1055 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1056 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1057 // CHECK-NEXT:  entry:
1058 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1059 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1060 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1061 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1062 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1063 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1064 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1065 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1066 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1067 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1068 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1069 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1070 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1071 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1072 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1073 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1074 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
1075 // CHECK-NEXT:    ret void
1076 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1077 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1078 // CHECK-NEXT:  entry:
1079 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1080 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1081 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1082 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1083 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1084 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1085 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1086 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1087 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1088 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1089 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1090 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1091 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1092 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1093 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1094 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1095 // CHECK-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
1096 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1097 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func.2"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
1098 // CHECK-NEXT:    ret void
1099 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1100 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1101 // CHECK-NEXT:  entry:
1102 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1103 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1104 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1105 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1106 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1107 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1108 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1109 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1110 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1111 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1112 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1113 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1114 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1115 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1116 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1117 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1118 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 128 [[TMP8]], i64 400, i1 false)
1119 // CHECK-NEXT:    ret void
1120 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1121 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1122 // CHECK-NEXT:  entry:
1123 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1124 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1125 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1126 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1127 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1128 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1129 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1130 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1131 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1132 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1133 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1134 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1135 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1136 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1137 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1138 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1139 // CHECK-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
1140 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1141 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func.2"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
1142 // CHECK-NEXT:    ret void
1143 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
1144 // IR-GPU-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
1145 // IR-GPU-NEXT:  entry:
1146 // IR-GPU-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1147 // IR-GPU-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1148 // IR-GPU-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1149 // IR-GPU-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
1150 // IR-GPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1151 // IR-GPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
1152 // IR-GPU-NEXT:    [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
1153 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1154 // IR-GPU-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1155 // IR-GPU-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
1156 // IR-GPU-NEXT:    [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
1157 // IR-GPU-NEXT:    [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
1158 // IR-GPU-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
1159 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1160 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1161 // IR-GPU-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1162 // IR-GPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_kernel_environment to ptr), ptr [[DYN_PTR]])
1163 // IR-GPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
1164 // IR-GPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1165 // IR-GPU:       user_code.entry:
1166 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
1167 // IR-GPU-NEXT:    [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
1168 // IR-GPU-NEXT:    store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
1169 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
1170 // IR-GPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
1171 // IR-GPU-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
1172 // IR-GPU-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
1173 // IR-GPU-NEXT:    call void @__kmpc_target_deinit()
1174 // IR-GPU-NEXT:    ret void
1175 // IR-GPU:       worker.exit:
1176 // IR-GPU-NEXT:    ret void
1177 //
1178 //
1179 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined
1180 // IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
1181 // IR-GPU-NEXT:  entry:
1182 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1183 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1184 // IR-GPU-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1185 // IR-GPU-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1186 // IR-GPU-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1187 // IR-GPU-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
1188 // IR-GPU-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
1189 // IR-GPU-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
1190 // IR-GPU-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
1191 // IR-GPU-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
1192 // IR-GPU-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
1193 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
1194 // IR-GPU-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
1195 // IR-GPU-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
1196 // IR-GPU-NEXT:    [[J4:%.*]] = alloca i32, align 4, addrspace(5)
1197 // IR-GPU-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
1198 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
1199 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1200 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
1201 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
1202 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1203 // IR-GPU-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1204 // IR-GPU-NEXT:    [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
1205 // IR-GPU-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
1206 // IR-GPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
1207 // IR-GPU-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
1208 // IR-GPU-NEXT:    [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
1209 // IR-GPU-NEXT:    [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
1210 // IR-GPU-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
1211 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
1212 // IR-GPU-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
1213 // IR-GPU-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
1214 // IR-GPU-NEXT:    [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
1215 // IR-GPU-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
1216 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
1217 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1218 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1219 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
1220 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1221 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1222 // IR-GPU-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1223 // IR-GPU-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
1224 // IR-GPU-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1225 // IR-GPU-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
1226 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1227 // IR-GPU:       omp.arrayinit.body:
1228 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1229 // IR-GPU-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1230 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1231 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
1232 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1233 // IR-GPU:       omp.arrayinit.done:
1234 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1235 // IR-GPU-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1236 // IR-GPU-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1237 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1238 // IR-GPU-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1239 // IR-GPU-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1240 // IR-GPU-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
1241 // IR-GPU-NEXT:    call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
1242 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1243 // IR-GPU-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
1244 // IR-GPU-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1245 // IR-GPU:       cond.true:
1246 // IR-GPU-NEXT:    br label [[COND_END:%.*]]
1247 // IR-GPU:       cond.false:
1248 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1249 // IR-GPU-NEXT:    br label [[COND_END]]
1250 // IR-GPU:       cond.end:
1251 // IR-GPU-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
1252 // IR-GPU-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1253 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1254 // IR-GPU-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
1255 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
1256 // IR-GPU:       omp.inner.for.cond:
1257 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
1258 // IR-GPU-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
1259 // IR-GPU-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1260 // IR-GPU:       omp.inner.for.body:
1261 // IR-GPU-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1262 // IR-GPU-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
1263 // IR-GPU-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1264 // IR-GPU-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
1265 // IR-GPU-NEXT:    [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1266 // IR-GPU-NEXT:    store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
1267 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
1268 // IR-GPU-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
1269 // IR-GPU-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
1270 // IR-GPU-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
1271 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
1272 // IR-GPU-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
1273 // IR-GPU-NEXT:    store ptr [[TMP17]], ptr [[TMP16]], align 8
1274 // IR-GPU-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
1275 // IR-GPU-NEXT:    [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
1276 // IR-GPU-NEXT:    store ptr [[TMP19]], ptr [[TMP18]], align 8
1277 // IR-GPU-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
1278 // IR-GPU-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
1279 // IR-GPU-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1280 // IR-GPU-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
1281 // IR-GPU-NEXT:    call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
1282 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
1283 // IR-GPU:       omp.inner.for.inc:
1284 // IR-GPU-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
1285 // IR-GPU-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1286 // IR-GPU-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
1287 // IR-GPU-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
1288 // IR-GPU-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1289 // IR-GPU-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1290 // IR-GPU-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
1291 // IR-GPU-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1292 // IR-GPU-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1293 // IR-GPU-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1294 // IR-GPU-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
1295 // IR-GPU-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1296 // IR-GPU-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1297 // IR-GPU-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
1298 // IR-GPU-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
1299 // IR-GPU:       cond.true9:
1300 // IR-GPU-NEXT:    br label [[COND_END11:%.*]]
1301 // IR-GPU:       cond.false10:
1302 // IR-GPU-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1303 // IR-GPU-NEXT:    br label [[COND_END11]]
1304 // IR-GPU:       cond.end11:
1305 // IR-GPU-NEXT:    [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
1306 // IR-GPU-NEXT:    store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1307 // IR-GPU-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1308 // IR-GPU-NEXT:    store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
1309 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND]]
1310 // IR-GPU:       omp.inner.for.end:
1311 // IR-GPU-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
1312 // IR-GPU:       omp.loop.exit:
1313 // IR-GPU-NEXT:    [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1314 // IR-GPU-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
1315 // IR-GPU-NEXT:    call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP33]])
1316 // IR-GPU-NEXT:    [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1317 // IR-GPU-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
1318 // IR-GPU-NEXT:    br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
1319 // IR-GPU:       .omp.lastprivate.then:
1320 // IR-GPU-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
1321 // IR-GPU-NEXT:    [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1322 // IR-GPU-NEXT:    store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
1323 // IR-GPU-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
1324 // IR-GPU:       .omp.lastprivate.done:
1325 // IR-GPU-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1326 // IR-GPU-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP37]], align 8
1327 // IR-GPU-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
1328 // IR-GPU-NEXT:    [[TMP38:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, i64 400, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
1329 // IR-GPU-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[TMP38]], 1
1330 // IR-GPU-NEXT:    br i1 [[TMP39]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1331 // IR-GPU:       .omp.reduction.then:
1332 // IR-GPU-NEXT:    [[TMP40:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
1333 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP40]]
1334 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
1335 // IR-GPU:       omp.arraycpy.body:
1336 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1337 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1338 // IR-GPU-NEXT:    [[TMP41:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
1339 // IR-GPU-NEXT:    [[TMP42:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
1340 // IR-GPU-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP41]], [[TMP42]]
1341 // IR-GPU-NEXT:    store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
1342 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
1343 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
1344 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP40]]
1345 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
1346 // IR-GPU:       omp.arraycpy.done17:
1347 // IR-GPU-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
1348 // IR-GPU:       .omp.reduction.done:
1349 // IR-GPU-NEXT:    ret void
1350 //
1351 //
1352 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined
1353 // IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
1354 // IR-GPU-NEXT:  entry:
1355 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1356 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1357 // IR-GPU-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1358 // IR-GPU-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1359 // IR-GPU-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1360 // IR-GPU-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1361 // IR-GPU-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
1362 // IR-GPU-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
1363 // IR-GPU-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
1364 // IR-GPU-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
1365 // IR-GPU-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
1366 // IR-GPU-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
1367 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
1368 // IR-GPU-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
1369 // IR-GPU-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1370 // IR-GPU-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
1371 // IR-GPU-NEXT:    [[J5:%.*]] = alloca i32, align 4, addrspace(5)
1372 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1373 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
1374 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
1375 // IR-GPU-NEXT:    [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
1376 // IR-GPU-NEXT:    [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
1377 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1378 // IR-GPU-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1379 // IR-GPU-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
1380 // IR-GPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
1381 // IR-GPU-NEXT:    [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
1382 // IR-GPU-NEXT:    [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
1383 // IR-GPU-NEXT:    [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
1384 // IR-GPU-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
1385 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
1386 // IR-GPU-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
1387 // IR-GPU-NEXT:    [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
1388 // IR-GPU-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
1389 // IR-GPU-NEXT:    [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
1390 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1391 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1392 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
1393 // IR-GPU-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
1394 // IR-GPU-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
1395 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1396 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1397 // IR-GPU-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1398 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
1399 // IR-GPU-NEXT:    store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
1400 // IR-GPU-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
1401 // IR-GPU-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
1402 // IR-GPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
1403 // IR-GPU-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
1404 // IR-GPU-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
1405 // IR-GPU-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
1406 // IR-GPU-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1407 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1408 // IR-GPU-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
1409 // IR-GPU-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1410 // IR-GPU-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
1411 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1412 // IR-GPU:       omp.arrayinit.body:
1413 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1414 // IR-GPU-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1415 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1416 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
1417 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1418 // IR-GPU:       omp.arrayinit.done:
1419 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1420 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
1421 // IR-GPU-NEXT:    call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
1422 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
1423 // IR-GPU-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
1424 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
1425 // IR-GPU:       omp.inner.for.cond:
1426 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
1427 // IR-GPU-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
1428 // IR-GPU-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
1429 // IR-GPU-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
1430 // IR-GPU-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1431 // IR-GPU:       omp.inner.for.body:
1432 // IR-GPU-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1433 // IR-GPU-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
1434 // IR-GPU-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
1435 // IR-GPU-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
1436 // IR-GPU-NEXT:    store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1437 // IR-GPU-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1438 // IR-GPU-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1439 // IR-GPU-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
1440 // IR-GPU-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
1441 // IR-GPU-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
1442 // IR-GPU-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
1443 // IR-GPU-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
1444 // IR-GPU-NEXT:    store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1445 // IR-GPU-NEXT:    [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1446 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1447 // IR-GPU-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
1448 // IR-GPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
1449 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1450 // IR-GPU-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
1451 // IR-GPU-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
1452 // IR-GPU-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
1453 // IR-GPU-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
1454 // IR-GPU-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
1455 // IR-GPU-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
1456 // IR-GPU:       omp.body.continue:
1457 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
1458 // IR-GPU:       omp.inner.for.inc:
1459 // IR-GPU-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1460 // IR-GPU-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1461 // IR-GPU-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
1462 // IR-GPU-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1463 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
1464 // IR-GPU:       omp.inner.for.end:
1465 // IR-GPU-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
1466 // IR-GPU:       omp.loop.exit:
1467 // IR-GPU-NEXT:    [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1468 // IR-GPU-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
1469 // IR-GPU-NEXT:    call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
1470 // IR-GPU-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1471 // IR-GPU-NEXT:    store ptr [[SUM4_ASCAST]], ptr [[TMP20]], align 8
1472 // IR-GPU-NEXT:    [[TMP21:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i64 400, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
1473 // IR-GPU-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[TMP21]], 1
1474 // IR-GPU-NEXT:    br i1 [[TMP22]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1475 // IR-GPU:       .omp.reduction.then:
1476 // IR-GPU-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
1477 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP23]]
1478 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
1479 // IR-GPU:       omp.arraycpy.body:
1480 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1481 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1482 // IR-GPU-NEXT:    [[TMP24:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
1483 // IR-GPU-NEXT:    [[TMP25:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
1484 // IR-GPU-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
1485 // IR-GPU-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
1486 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
1487 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
1488 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP23]]
1489 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
1490 // IR-GPU:       omp.arraycpy.done19:
1491 // IR-GPU-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
1492 // IR-GPU:       .omp.reduction.done:
1493 // IR-GPU-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1494 // IR-GPU-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
1495 // IR-GPU-NEXT:    br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
1496 // IR-GPU:       .omp.lastprivate.then:
1497 // IR-GPU-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
1498 // IR-GPU-NEXT:    [[TMP28:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1499 // IR-GPU-NEXT:    store i32 [[TMP28]], ptr [[J_ADDR_ASCAST]], align 4
1500 // IR-GPU-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
1501 // IR-GPU:       .omp.lastprivate.done:
1502 // IR-GPU-NEXT:    ret void
1503 //
1504 //
1505 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
1506 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
1507 // IR-GPU-NEXT:  entry:
1508 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1509 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
1510 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
1511 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
1512 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1513 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1514 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1515 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1516 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1517 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
1518 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
1519 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
1520 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1521 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
1522 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
1523 // IR-GPU-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
1524 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1525 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
1526 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
1527 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
1528 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1529 // IR-GPU-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
1530 // IR-GPU-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1531 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
1532 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
1533 // IR-GPU:       .shuffle.pre_cond:
1534 // IR-GPU-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
1535 // IR-GPU-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
1536 // IR-GPU-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
1537 // IR-GPU-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
1538 // IR-GPU-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
1539 // IR-GPU-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
1540 // IR-GPU-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
1541 // IR-GPU-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
1542 // IR-GPU:       .shuffle.then:
1543 // IR-GPU-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
1544 // IR-GPU-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
1545 // IR-GPU-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1546 // IR-GPU-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
1547 // IR-GPU-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
1548 // IR-GPU-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
1549 // IR-GPU-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
1550 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
1551 // IR-GPU:       .shuffle.exit:
1552 // IR-GPU-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
1553 // IR-GPU-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
1554 // IR-GPU-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
1555 // IR-GPU-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1556 // IR-GPU-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
1557 // IR-GPU-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
1558 // IR-GPU-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
1559 // IR-GPU-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
1560 // IR-GPU-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
1561 // IR-GPU-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
1562 // IR-GPU-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
1563 // IR-GPU-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
1564 // IR-GPU-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
1565 // IR-GPU-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
1566 // IR-GPU:       then:
1567 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
1568 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1569 // IR-GPU:       else:
1570 // IR-GPU-NEXT:    br label [[IFCONT]]
1571 // IR-GPU:       ifcont:
1572 // IR-GPU-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
1573 // IR-GPU-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1574 // IR-GPU-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1575 // IR-GPU-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1576 // IR-GPU:       then4:
1577 // IR-GPU-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1578 // IR-GPU-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
1579 // IR-GPU-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1580 // IR-GPU-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
1581 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
1582 // IR-GPU-NEXT:    br label [[IFCONT6:%.*]]
1583 // IR-GPU:       else5:
1584 // IR-GPU-NEXT:    br label [[IFCONT6]]
1585 // IR-GPU:       ifcont6:
1586 // IR-GPU-NEXT:    ret void
1587 //
1588 //
1589 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
1590 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1591 // IR-GPU-NEXT:  entry:
1592 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1593 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1594 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1595 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1596 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1597 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1598 // IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1599 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1600 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1601 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1602 // IR-GPU-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1603 // IR-GPU-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1604 // IR-GPU-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1605 // IR-GPU-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1606 // IR-GPU-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1607 // IR-GPU-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1608 // IR-GPU-NEXT:    br label [[PRECOND:%.*]]
1609 // IR-GPU:       precond:
1610 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1611 // IR-GPU-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1612 // IR-GPU-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1613 // IR-GPU:       body:
1614 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
1615 // IR-GPU-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1616 // IR-GPU-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1617 // IR-GPU:       then:
1618 // IR-GPU-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1619 // IR-GPU-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1620 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1621 // IR-GPU-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1622 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1623 // IR-GPU-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1624 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1625 // IR-GPU:       else:
1626 // IR-GPU-NEXT:    br label [[IFCONT]]
1627 // IR-GPU:       ifcont:
1628 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1629 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1630 // IR-GPU-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1631 // IR-GPU-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1632 // IR-GPU:       then2:
1633 // IR-GPU-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1634 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1635 // IR-GPU-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1636 // IR-GPU-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1637 // IR-GPU-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1638 // IR-GPU-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
1639 // IR-GPU-NEXT:    br label [[IFCONT4:%.*]]
1640 // IR-GPU:       else3:
1641 // IR-GPU-NEXT:    br label [[IFCONT4]]
1642 // IR-GPU:       ifcont4:
1643 // IR-GPU-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1644 // IR-GPU-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1645 // IR-GPU-NEXT:    br label [[PRECOND]]
1646 // IR-GPU:       exit:
1647 // IR-GPU-NEXT:    ret void
1648 //
1649 //
1650 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.1
1651 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
1652 // IR-GPU-NEXT:  entry:
1653 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1654 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
1655 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
1656 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
1657 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1658 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1659 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1660 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1661 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1662 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
1663 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
1664 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
1665 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1666 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
1667 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
1668 // IR-GPU-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
1669 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1670 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
1671 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
1672 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
1673 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1674 // IR-GPU-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
1675 // IR-GPU-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1676 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
1677 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
1678 // IR-GPU:       .shuffle.pre_cond:
1679 // IR-GPU-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
1680 // IR-GPU-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
1681 // IR-GPU-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
1682 // IR-GPU-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
1683 // IR-GPU-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
1684 // IR-GPU-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
1685 // IR-GPU-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
1686 // IR-GPU-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
1687 // IR-GPU:       .shuffle.then:
1688 // IR-GPU-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
1689 // IR-GPU-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
1690 // IR-GPU-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1691 // IR-GPU-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
1692 // IR-GPU-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
1693 // IR-GPU-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
1694 // IR-GPU-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
1695 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
1696 // IR-GPU:       .shuffle.exit:
1697 // IR-GPU-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
1698 // IR-GPU-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
1699 // IR-GPU-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
1700 // IR-GPU-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1701 // IR-GPU-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
1702 // IR-GPU-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
1703 // IR-GPU-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
1704 // IR-GPU-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
1705 // IR-GPU-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
1706 // IR-GPU-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
1707 // IR-GPU-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
1708 // IR-GPU-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
1709 // IR-GPU-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
1710 // IR-GPU-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
1711 // IR-GPU:       then:
1712 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
1713 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1714 // IR-GPU:       else:
1715 // IR-GPU-NEXT:    br label [[IFCONT]]
1716 // IR-GPU:       ifcont:
1717 // IR-GPU-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
1718 // IR-GPU-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1719 // IR-GPU-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1720 // IR-GPU-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1721 // IR-GPU:       then4:
1722 // IR-GPU-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1723 // IR-GPU-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
1724 // IR-GPU-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1725 // IR-GPU-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
1726 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
1727 // IR-GPU-NEXT:    br label [[IFCONT6:%.*]]
1728 // IR-GPU:       else5:
1729 // IR-GPU-NEXT:    br label [[IFCONT6]]
1730 // IR-GPU:       ifcont6:
1731 // IR-GPU-NEXT:    ret void
1732 //
1733 //
1734 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.2
1735 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1736 // IR-GPU-NEXT:  entry:
1737 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1738 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1739 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1740 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1741 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1742 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1743 // IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1744 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1745 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1746 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1747 // IR-GPU-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1748 // IR-GPU-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1749 // IR-GPU-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1750 // IR-GPU-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1751 // IR-GPU-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1752 // IR-GPU-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1753 // IR-GPU-NEXT:    br label [[PRECOND:%.*]]
1754 // IR-GPU:       precond:
1755 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1756 // IR-GPU-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1757 // IR-GPU-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1758 // IR-GPU:       body:
1759 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1760 // IR-GPU-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1761 // IR-GPU-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1762 // IR-GPU:       then:
1763 // IR-GPU-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1764 // IR-GPU-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1765 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1766 // IR-GPU-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1767 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1768 // IR-GPU-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1769 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1770 // IR-GPU:       else:
1771 // IR-GPU-NEXT:    br label [[IFCONT]]
1772 // IR-GPU:       ifcont:
1773 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1774 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1775 // IR-GPU-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1776 // IR-GPU-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1777 // IR-GPU:       then2:
1778 // IR-GPU-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1779 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1780 // IR-GPU-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1781 // IR-GPU-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1782 // IR-GPU-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1783 // IR-GPU-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
1784 // IR-GPU-NEXT:    br label [[IFCONT4:%.*]]
1785 // IR-GPU:       else3:
1786 // IR-GPU-NEXT:    br label [[IFCONT4]]
1787 // IR-GPU:       ifcont4:
1788 // IR-GPU-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1789 // IR-GPU-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1790 // IR-GPU-NEXT:    br label [[PRECOND]]
1791 // IR-GPU:       exit:
1792 // IR-GPU-NEXT:    ret void
1793 //
1794 //
1795 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1796 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1797 // IR-GPU-NEXT:  entry:
1798 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1799 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1800 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1801 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1802 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1803 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1804 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1805 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1806 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1807 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1808 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1809 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1810 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1811 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1812 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 [[TMP5]]
1813 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], ptr [[TMP8]], i32 0, i32 0
1814 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SUM]], ptr align 4 [[TMP7]], i64 400, i1 false)
1815 // IR-GPU-NEXT:    ret void
1816 //
1817 //
1818 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1819 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1820 // IR-GPU-NEXT:  entry:
1821 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1822 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1823 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1824 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1825 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1826 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1827 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1828 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1829 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1830 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1831 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1832 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1833 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1834 // IR-GPU-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1835 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 [[TMP4]]
1836 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], ptr [[TMP6]], i32 0, i32 0
1837 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[TMP5]], align 8
1838 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1839 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
1840 // IR-GPU-NEXT:    ret void
1841 //
1842 //
1843 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1844 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1845 // IR-GPU-NEXT:  entry:
1846 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1847 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1848 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1849 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1850 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1851 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1852 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1853 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1854 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1855 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1856 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1857 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1858 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1859 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1860 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 [[TMP5]]
1861 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], ptr [[TMP8]], i32 0, i32 0
1862 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[SUM]], i64 400, i1 false)
1863 // IR-GPU-NEXT:    ret void
1864 //
1865 //
1866 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1867 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1868 // IR-GPU-NEXT:  entry:
1869 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1870 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1871 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1872 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1873 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1874 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1875 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1876 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1877 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1878 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1879 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1880 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1881 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1882 // IR-GPU-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1883 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 [[TMP4]]
1884 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], ptr [[TMP6]], i32 0, i32 0
1885 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[TMP5]], align 8
1886 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1887 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
1888 // IR-GPU-NEXT:    ret void
1889 //
1890 //
1891 // IR-LABEL: define {{[^@]+}}@_Z3foov
1892 // IR-SAME: () #[[ATTR0:[0-9]+]] {
1893 // IR-NEXT:  entry:
1894 // IR-NEXT:    [[I:%.*]] = alloca i32, align 4
1895 // IR-NEXT:    [[J:%.*]] = alloca i32, align 4
1896 // IR-NEXT:    [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
1897 // IR-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
1898 // IR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[J]], align 4
1899 // IR-NEXT:    store i32 [[TMP0]], ptr [[J_CASTED]], align 4
1900 // IR-NEXT:    [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
1901 // IR-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
1902 // IR-NEXT:    ret i32 0
1903 //
1904 //
1905 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
1906 // IR-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
1907 // IR-NEXT:  entry:
1908 // IR-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
1909 // IR-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
1910 // IR-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
1911 // IR-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
1912 // IR-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
1913 // IR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
1914 // IR-NEXT:    [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
1915 // IR-NEXT:    store i32 [[TMP1]], ptr [[J_CASTED]], align 4
1916 // IR-NEXT:    [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
1917 // IR-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
1918 // IR-NEXT:    ret void
1919 //
1920 //
1921 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
1922 // IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
1923 // IR-NEXT:  entry:
1924 // IR-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
1925 // IR-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
1926 // IR-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
1927 // IR-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
1928 // IR-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
1929 // IR-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
1930 // IR-NEXT:    [[TMP:%.*]] = alloca i32, align 4
1931 // IR-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
1932 // IR-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
1933 // IR-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
1934 // IR-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
1935 // IR-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
1936 // IR-NEXT:    [[J3:%.*]] = alloca i32, align 4
1937 // IR-NEXT:    [[I:%.*]] = alloca i32, align 4
1938 // IR-NEXT:    [[J4:%.*]] = alloca i32, align 4
1939 // IR-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
1940 // IR-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
1941 // IR-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
1942 // IR-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
1943 // IR-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
1944 // IR-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
1945 // IR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
1946 // IR-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
1947 // IR-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1948 // IR-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
1949 // IR-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1950 // IR:       omp.arrayinit.body:
1951 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1952 // IR-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1953 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1954 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
1955 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1956 // IR:       omp.arrayinit.done:
1957 // IR-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
1958 // IR-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
1959 // IR-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
1960 // IR-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
1961 // IR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
1962 // IR-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
1963 // IR-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
1964 // IR-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1965 // IR-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
1966 // IR-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1967 // IR:       cond.true:
1968 // IR-NEXT:    br label [[COND_END:%.*]]
1969 // IR:       cond.false:
1970 // IR-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1971 // IR-NEXT:    br label [[COND_END]]
1972 // IR:       cond.end:
1973 // IR-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
1974 // IR-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
1975 // IR-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
1976 // IR-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
1977 // IR-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
1978 // IR:       omp.inner.for.cond:
1979 // IR-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
1980 // IR-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1981 // IR-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
1982 // IR-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1983 // IR:       omp.inner.for.body:
1984 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
1985 // IR-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
1986 // IR-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1987 // IR-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
1988 // IR-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
1989 // IR-NEXT:    store i32 [[TMP13]], ptr [[J_CASTED]], align 4
1990 // IR-NEXT:    [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
1991 // IR-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
1992 // IR-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
1993 // IR:       omp.inner.for.inc:
1994 // IR-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
1995 // IR-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
1996 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
1997 // IR-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
1998 // IR-NEXT:    br label [[OMP_INNER_FOR_COND]]
1999 // IR:       omp.inner.for.end:
2000 // IR-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2001 // IR:       omp.loop.exit:
2002 // IR-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2003 // IR-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
2004 // IR-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP18]])
2005 // IR-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2006 // IR-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
2007 // IR-NEXT:    br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2008 // IR:       .omp.lastprivate.then:
2009 // IR-NEXT:    store i32 10, ptr [[J3]], align 4
2010 // IR-NEXT:    [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
2011 // IR-NEXT:    store i32 [[TMP21]], ptr [[J_ADDR]], align 4
2012 // IR-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2013 // IR:       .omp.lastprivate.done:
2014 // IR-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2015 // IR-NEXT:    store ptr [[SUM1]], ptr [[TMP22]], align 8
2016 // IR-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2017 // IR-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
2018 // IR-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2019 // IR-NEXT:    switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2020 // IR-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2021 // IR-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2022 // IR-NEXT:    ]
2023 // IR:       .omp.reduction.case1:
2024 // IR-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2025 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
2026 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2027 // IR:       omp.arraycpy.body:
2028 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2029 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2030 // IR-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2031 // IR-NEXT:    [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2032 // IR-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
2033 // IR-NEXT:    store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2034 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
2035 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2036 // IR-NEXT:    [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
2037 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
2038 // IR:       omp.arraycpy.done10:
2039 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2040 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2041 // IR:       .omp.reduction.case2:
2042 // IR-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2043 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
2044 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
2045 // IR:       omp.arraycpy.body12:
2046 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2047 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2048 // IR-NEXT:    [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
2049 // IR-NEXT:    [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
2050 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
2051 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
2052 // IR-NEXT:    [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
2053 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
2054 // IR:       omp.arraycpy.done18:
2055 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2056 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2057 // IR:       .omp.reduction.default:
2058 // IR-NEXT:    ret void
2059 //
2060 //
2061 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
2062 // IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2063 // IR-NEXT:  entry:
2064 // IR-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2065 // IR-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2066 // IR-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
2067 // IR-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
2068 // IR-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2069 // IR-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2070 // IR-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
2071 // IR-NEXT:    [[TMP:%.*]] = alloca i32, align 4
2072 // IR-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
2073 // IR-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
2074 // IR-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
2075 // IR-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2076 // IR-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2077 // IR-NEXT:    [[J3:%.*]] = alloca i32, align 4
2078 // IR-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
2079 // IR-NEXT:    [[I:%.*]] = alloca i32, align 4
2080 // IR-NEXT:    [[J5:%.*]] = alloca i32, align 4
2081 // IR-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2082 // IR-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2083 // IR-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2084 // IR-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2085 // IR-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2086 // IR-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2087 // IR-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2088 // IR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2089 // IR-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
2090 // IR-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
2091 // IR-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2092 // IR-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
2093 // IR-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2094 // IR-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
2095 // IR-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
2096 // IR-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
2097 // IR-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2098 // IR-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2099 // IR-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
2100 // IR-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2101 // IR-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
2102 // IR-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2103 // IR:       omp.arrayinit.body:
2104 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2105 // IR-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2106 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2107 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
2108 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2109 // IR:       omp.arrayinit.done:
2110 // IR-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2111 // IR-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
2112 // IR-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2113 // IR-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2114 // IR-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
2115 // IR-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2116 // IR:       cond.true:
2117 // IR-NEXT:    br label [[COND_END:%.*]]
2118 // IR:       cond.false:
2119 // IR-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2120 // IR-NEXT:    br label [[COND_END]]
2121 // IR:       cond.end:
2122 // IR-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
2123 // IR-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
2124 // IR-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
2125 // IR-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
2126 // IR-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
2127 // IR:       omp.inner.for.cond:
2128 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
2129 // IR-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
2130 // IR-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
2131 // IR-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2132 // IR:       omp.inner.for.body:
2133 // IR-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2134 // IR-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
2135 // IR-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
2136 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
2137 // IR-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2138 // IR-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2139 // IR-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2140 // IR-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
2141 // IR-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
2142 // IR-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
2143 // IR-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
2144 // IR-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
2145 // IR-NEXT:    store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2146 // IR-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2147 // IR-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2148 // IR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
2149 // IR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
2150 // IR-NEXT:    [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2151 // IR-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
2152 // IR-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
2153 // IR-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2154 // IR-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
2155 // IR-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2156 // IR-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
2157 // IR:       omp.body.continue:
2158 // IR-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
2159 // IR:       omp.inner.for.inc:
2160 // IR-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2161 // IR-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
2162 // IR-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2163 // IR-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
2164 // IR:       omp.inner.for.end:
2165 // IR-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2166 // IR:       omp.loop.exit:
2167 // IR-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2168 // IR-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
2169 // IR-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
2170 // IR-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2171 // IR-NEXT:    store ptr [[SUM4]], ptr [[TMP21]], align 8
2172 // IR-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2173 // IR-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
2174 // IR-NEXT:    [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2175 // IR-NEXT:    switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2176 // IR-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2177 // IR-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2178 // IR-NEXT:    ]
2179 // IR:       .omp.reduction.case1:
2180 // IR-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2181 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
2182 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2183 // IR:       omp.arraycpy.body:
2184 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2185 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2186 // IR-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2187 // IR-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2188 // IR-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
2189 // IR-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2190 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
2191 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2192 // IR-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
2193 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
2194 // IR:       omp.arraycpy.done19:
2195 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2196 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2197 // IR:       .omp.reduction.case2:
2198 // IR-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2199 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
2200 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
2201 // IR:       omp.arraycpy.body21:
2202 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2203 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2204 // IR-NEXT:    [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
2205 // IR-NEXT:    [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
2206 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
2207 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
2208 // IR-NEXT:    [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
2209 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
2210 // IR:       omp.arraycpy.done27:
2211 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2212 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2213 // IR:       .omp.reduction.default:
2214 // IR-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2215 // IR-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
2216 // IR-NEXT:    br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2217 // IR:       .omp.lastprivate.then:
2218 // IR-NEXT:    store i32 10, ptr [[J3]], align 4
2219 // IR-NEXT:    [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
2220 // IR-NEXT:    store i32 [[TMP33]], ptr [[J_ADDR]], align 4
2221 // IR-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2222 // IR:       .omp.lastprivate.done:
2223 // IR-NEXT:    ret void
2224 //
2225 //
2226 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
2227 // IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
2228 // IR-NEXT:  entry:
2229 // IR-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2230 // IR-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2231 // IR-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2232 // IR-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2233 // IR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2234 // IR-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2235 // IR-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2236 // IR-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2237 // IR-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2238 // IR-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2239 // IR-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2240 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2241 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2242 // IR:       omp.arraycpy.body:
2243 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2244 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2245 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2246 // IR-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2247 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2248 // IR-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2249 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2250 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2251 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2252 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2253 // IR:       omp.arraycpy.done2:
2254 // IR-NEXT:    ret void
2255 //
2256 //
2257 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
2258 // IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
2259 // IR-NEXT:  entry:
2260 // IR-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2261 // IR-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2262 // IR-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2263 // IR-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2264 // IR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2265 // IR-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2266 // IR-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2267 // IR-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2268 // IR-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2269 // IR-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2270 // IR-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2271 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2272 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2273 // IR:       omp.arraycpy.body:
2274 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2275 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2276 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2277 // IR-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2278 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2279 // IR-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2280 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2281 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2282 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2283 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2284 // IR:       omp.arraycpy.done2:
2285 // IR-NEXT:    ret void
2286 //
2287 //
2288 // IR-PCH-LABEL: define {{[^@]+}}@_Z3foov
2289 // IR-PCH-SAME: () #[[ATTR0:[0-9]+]] {
2290 // IR-PCH-NEXT:  entry:
2291 // IR-PCH-NEXT:    [[I:%.*]] = alloca i32, align 4
2292 // IR-PCH-NEXT:    [[J:%.*]] = alloca i32, align 4
2293 // IR-PCH-NEXT:    [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
2294 // IR-PCH-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
2295 // IR-PCH-NEXT:    [[TMP0:%.*]] = load i32, ptr [[J]], align 4
2296 // IR-PCH-NEXT:    store i32 [[TMP0]], ptr [[J_CASTED]], align 4
2297 // IR-PCH-NEXT:    [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
2298 // IR-PCH-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
2299 // IR-PCH-NEXT:    ret i32 0
2300 //
2301 //
2302 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
2303 // IR-PCH-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
2304 // IR-PCH-NEXT:  entry:
2305 // IR-PCH-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2306 // IR-PCH-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2307 // IR-PCH-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
2308 // IR-PCH-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2309 // IR-PCH-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2310 // IR-PCH-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2311 // IR-PCH-NEXT:    [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
2312 // IR-PCH-NEXT:    store i32 [[TMP1]], ptr [[J_CASTED]], align 4
2313 // IR-PCH-NEXT:    [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
2314 // IR-PCH-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
2315 // IR-PCH-NEXT:    ret void
2316 //
2317 //
2318 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
2319 // IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2320 // IR-PCH-NEXT:  entry:
2321 // IR-PCH-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2322 // IR-PCH-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2323 // IR-PCH-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2324 // IR-PCH-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2325 // IR-PCH-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
2326 // IR-PCH-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
2327 // IR-PCH-NEXT:    [[TMP:%.*]] = alloca i32, align 4
2328 // IR-PCH-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
2329 // IR-PCH-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
2330 // IR-PCH-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
2331 // IR-PCH-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2332 // IR-PCH-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2333 // IR-PCH-NEXT:    [[J3:%.*]] = alloca i32, align 4
2334 // IR-PCH-NEXT:    [[I:%.*]] = alloca i32, align 4
2335 // IR-PCH-NEXT:    [[J4:%.*]] = alloca i32, align 4
2336 // IR-PCH-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
2337 // IR-PCH-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2338 // IR-PCH-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2339 // IR-PCH-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2340 // IR-PCH-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2341 // IR-PCH-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2342 // IR-PCH-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2343 // IR-PCH-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
2344 // IR-PCH-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2345 // IR-PCH-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
2346 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2347 // IR-PCH:       omp.arrayinit.body:
2348 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2349 // IR-PCH-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2350 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2351 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
2352 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2353 // IR-PCH:       omp.arrayinit.done:
2354 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
2355 // IR-PCH-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
2356 // IR-PCH-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2357 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2358 // IR-PCH-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2359 // IR-PCH-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
2360 // IR-PCH-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2361 // IR-PCH-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2362 // IR-PCH-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
2363 // IR-PCH-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2364 // IR-PCH:       cond.true:
2365 // IR-PCH-NEXT:    br label [[COND_END:%.*]]
2366 // IR-PCH:       cond.false:
2367 // IR-PCH-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2368 // IR-PCH-NEXT:    br label [[COND_END]]
2369 // IR-PCH:       cond.end:
2370 // IR-PCH-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
2371 // IR-PCH-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
2372 // IR-PCH-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
2373 // IR-PCH-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
2374 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
2375 // IR-PCH:       omp.inner.for.cond:
2376 // IR-PCH-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
2377 // IR-PCH-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2378 // IR-PCH-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
2379 // IR-PCH-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2380 // IR-PCH:       omp.inner.for.body:
2381 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
2382 // IR-PCH-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
2383 // IR-PCH-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2384 // IR-PCH-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
2385 // IR-PCH-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
2386 // IR-PCH-NEXT:    store i32 [[TMP13]], ptr [[J_CASTED]], align 4
2387 // IR-PCH-NEXT:    [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
2388 // IR-PCH-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
2389 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
2390 // IR-PCH:       omp.inner.for.inc:
2391 // IR-PCH-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
2392 // IR-PCH-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
2393 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
2394 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
2395 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND]]
2396 // IR-PCH:       omp.inner.for.end:
2397 // IR-PCH-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2398 // IR-PCH:       omp.loop.exit:
2399 // IR-PCH-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2400 // IR-PCH-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
2401 // IR-PCH-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP18]])
2402 // IR-PCH-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2403 // IR-PCH-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
2404 // IR-PCH-NEXT:    br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2405 // IR-PCH:       .omp.lastprivate.then:
2406 // IR-PCH-NEXT:    store i32 10, ptr [[J3]], align 4
2407 // IR-PCH-NEXT:    [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
2408 // IR-PCH-NEXT:    store i32 [[TMP21]], ptr [[J_ADDR]], align 4
2409 // IR-PCH-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2410 // IR-PCH:       .omp.lastprivate.done:
2411 // IR-PCH-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2412 // IR-PCH-NEXT:    store ptr [[SUM1]], ptr [[TMP22]], align 8
2413 // IR-PCH-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2414 // IR-PCH-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
2415 // IR-PCH-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2416 // IR-PCH-NEXT:    switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2417 // IR-PCH-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2418 // IR-PCH-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2419 // IR-PCH-NEXT:    ]
2420 // IR-PCH:       .omp.reduction.case1:
2421 // IR-PCH-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2422 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
2423 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2424 // IR-PCH:       omp.arraycpy.body:
2425 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2426 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2427 // IR-PCH-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2428 // IR-PCH-NEXT:    [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2429 // IR-PCH-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
2430 // IR-PCH-NEXT:    store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2431 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
2432 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2433 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
2434 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
2435 // IR-PCH:       omp.arraycpy.done10:
2436 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2437 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2438 // IR-PCH:       .omp.reduction.case2:
2439 // IR-PCH-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2440 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
2441 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
2442 // IR-PCH:       omp.arraycpy.body12:
2443 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2444 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2445 // IR-PCH-NEXT:    [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
2446 // IR-PCH-NEXT:    [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
2447 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
2448 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
2449 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
2450 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
2451 // IR-PCH:       omp.arraycpy.done18:
2452 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2453 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2454 // IR-PCH:       .omp.reduction.default:
2455 // IR-PCH-NEXT:    ret void
2456 //
2457 //
2458 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
2459 // IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2460 // IR-PCH-NEXT:  entry:
2461 // IR-PCH-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2462 // IR-PCH-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2463 // IR-PCH-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
2464 // IR-PCH-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
2465 // IR-PCH-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2466 // IR-PCH-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2467 // IR-PCH-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
2468 // IR-PCH-NEXT:    [[TMP:%.*]] = alloca i32, align 4
2469 // IR-PCH-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
2470 // IR-PCH-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
2471 // IR-PCH-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
2472 // IR-PCH-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2473 // IR-PCH-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2474 // IR-PCH-NEXT:    [[J3:%.*]] = alloca i32, align 4
2475 // IR-PCH-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
2476 // IR-PCH-NEXT:    [[I:%.*]] = alloca i32, align 4
2477 // IR-PCH-NEXT:    [[J5:%.*]] = alloca i32, align 4
2478 // IR-PCH-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2479 // IR-PCH-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2480 // IR-PCH-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2481 // IR-PCH-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2482 // IR-PCH-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2483 // IR-PCH-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2484 // IR-PCH-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2485 // IR-PCH-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2486 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
2487 // IR-PCH-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
2488 // IR-PCH-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2489 // IR-PCH-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
2490 // IR-PCH-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2491 // IR-PCH-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
2492 // IR-PCH-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
2493 // IR-PCH-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
2494 // IR-PCH-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2495 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2496 // IR-PCH-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
2497 // IR-PCH-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2498 // IR-PCH-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
2499 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2500 // IR-PCH:       omp.arrayinit.body:
2501 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2502 // IR-PCH-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2503 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2504 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
2505 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2506 // IR-PCH:       omp.arrayinit.done:
2507 // IR-PCH-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2508 // IR-PCH-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
2509 // IR-PCH-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2510 // IR-PCH-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2511 // IR-PCH-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
2512 // IR-PCH-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2513 // IR-PCH:       cond.true:
2514 // IR-PCH-NEXT:    br label [[COND_END:%.*]]
2515 // IR-PCH:       cond.false:
2516 // IR-PCH-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2517 // IR-PCH-NEXT:    br label [[COND_END]]
2518 // IR-PCH:       cond.end:
2519 // IR-PCH-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
2520 // IR-PCH-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
2521 // IR-PCH-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
2522 // IR-PCH-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
2523 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
2524 // IR-PCH:       omp.inner.for.cond:
2525 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
2526 // IR-PCH-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
2527 // IR-PCH-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
2528 // IR-PCH-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2529 // IR-PCH:       omp.inner.for.body:
2530 // IR-PCH-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2531 // IR-PCH-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
2532 // IR-PCH-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
2533 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
2534 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2535 // IR-PCH-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2536 // IR-PCH-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2537 // IR-PCH-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
2538 // IR-PCH-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
2539 // IR-PCH-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
2540 // IR-PCH-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
2541 // IR-PCH-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
2542 // IR-PCH-NEXT:    store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2543 // IR-PCH-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2544 // IR-PCH-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2545 // IR-PCH-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
2546 // IR-PCH-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
2547 // IR-PCH-NEXT:    [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2548 // IR-PCH-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
2549 // IR-PCH-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
2550 // IR-PCH-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2551 // IR-PCH-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
2552 // IR-PCH-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2553 // IR-PCH-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
2554 // IR-PCH:       omp.body.continue:
2555 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
2556 // IR-PCH:       omp.inner.for.inc:
2557 // IR-PCH-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2558 // IR-PCH-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
2559 // IR-PCH-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2560 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
2561 // IR-PCH:       omp.inner.for.end:
2562 // IR-PCH-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2563 // IR-PCH:       omp.loop.exit:
2564 // IR-PCH-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2565 // IR-PCH-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
2566 // IR-PCH-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
2567 // IR-PCH-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2568 // IR-PCH-NEXT:    store ptr [[SUM4]], ptr [[TMP21]], align 8
2569 // IR-PCH-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2570 // IR-PCH-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
2571 // IR-PCH-NEXT:    [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2572 // IR-PCH-NEXT:    switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2573 // IR-PCH-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2574 // IR-PCH-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2575 // IR-PCH-NEXT:    ]
2576 // IR-PCH:       .omp.reduction.case1:
2577 // IR-PCH-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2578 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
2579 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2580 // IR-PCH:       omp.arraycpy.body:
2581 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2582 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2583 // IR-PCH-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2584 // IR-PCH-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2585 // IR-PCH-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
2586 // IR-PCH-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2587 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
2588 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2589 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
2590 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
2591 // IR-PCH:       omp.arraycpy.done19:
2592 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2593 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2594 // IR-PCH:       .omp.reduction.case2:
2595 // IR-PCH-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2596 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
2597 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
2598 // IR-PCH:       omp.arraycpy.body21:
2599 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2600 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2601 // IR-PCH-NEXT:    [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
2602 // IR-PCH-NEXT:    [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
2603 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
2604 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
2605 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
2606 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
2607 // IR-PCH:       omp.arraycpy.done27:
2608 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2609 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2610 // IR-PCH:       .omp.reduction.default:
2611 // IR-PCH-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2612 // IR-PCH-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
2613 // IR-PCH-NEXT:    br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2614 // IR-PCH:       .omp.lastprivate.then:
2615 // IR-PCH-NEXT:    store i32 10, ptr [[J3]], align 4
2616 // IR-PCH-NEXT:    [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
2617 // IR-PCH-NEXT:    store i32 [[TMP33]], ptr [[J_ADDR]], align 4
2618 // IR-PCH-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2619 // IR-PCH:       .omp.lastprivate.done:
2620 // IR-PCH-NEXT:    ret void
2621 //
2622 //
2623 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
2624 // IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
2625 // IR-PCH-NEXT:  entry:
2626 // IR-PCH-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2627 // IR-PCH-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2628 // IR-PCH-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2629 // IR-PCH-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2630 // IR-PCH-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2631 // IR-PCH-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2632 // IR-PCH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2633 // IR-PCH-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2634 // IR-PCH-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2635 // IR-PCH-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2636 // IR-PCH-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2637 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2638 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2639 // IR-PCH:       omp.arraycpy.body:
2640 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2641 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2642 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2643 // IR-PCH-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2644 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2645 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2646 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2647 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2648 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2649 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2650 // IR-PCH:       omp.arraycpy.done2:
2651 // IR-PCH-NEXT:    ret void
2652 //
2653 //
2654 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
2655 // IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
2656 // IR-PCH-NEXT:  entry:
2657 // IR-PCH-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2658 // IR-PCH-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2659 // IR-PCH-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2660 // IR-PCH-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2661 // IR-PCH-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2662 // IR-PCH-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2663 // IR-PCH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2664 // IR-PCH-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2665 // IR-PCH-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2666 // IR-PCH-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2667 // IR-PCH-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2668 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2669 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2670 // IR-PCH:       omp.arraycpy.body:
2671 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2672 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2673 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2674 // IR-PCH-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2675 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2676 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2677 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2678 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2679 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2680 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2681 // IR-PCH:       omp.arraycpy.done2:
2682 // IR-PCH-NEXT:    ret void
2683 //
2684