xref: /llvm-project/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (revision 921bd299134fbe17c676b2486af269e18281def4)
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // REQUIRES: amdgpu-registered-target
3 
4 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
5 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=IR-GPU
6 
7 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
8 
9 // Check same results after serialization round-trip
10 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
11 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
12 
13 // expected-no-diagnostics
14 
15 #ifndef HEADER  // Guard: the PCH RUN lines feed this file through the compiler twice (emit-pch, then -include-pch).
16 #define HEADER
17 int foo() {  // Minimal kernel exercising 'target teams loop' with reduction + collapse + lastprivate.
18   int i;
19   int j;  // lastprivate(j) on the directive below: the last iteration's value is stored back after the loop.
20   int sum[10][10];  // reduction(+:sum) target; mapped tofrom so the host observes the reduced result.
21   // NOTE(review): keep the pragma on source line 22 -- the generated offload entry is line-suffixed ('__Z3foov_l22' in the checks).
22   #pragma omp target teams loop reduction(+:sum) collapse(2) \
23       bind(parallel) order(concurrent) lastprivate(j) map(tofrom:sum)
24   for(i=0; i<10; i++)
25     for(j=0; j<10; j++)  // collapse(2): both loops fold into a single 100-iteration space (UB 99 in the checks).
26       sum[i][j] += i;
27 
28   return 0;  // Return value is irrelevant; only the emitted IR is checked.
29 }
30 #endif  // HEADER
31 // IR-PCH-HOST-LABEL: define {{[^@]+}}@_Z3foov
32 // IR-PCH-HOST-SAME: () #[[ATTR0:[0-9]+]] {
33 // IR-PCH-HOST-NEXT:  entry:
34 // IR-PCH-HOST-NEXT:    [[I:%.*]] = alloca i32, align 4
35 // IR-PCH-HOST-NEXT:    [[J:%.*]] = alloca i32, align 4
36 // IR-PCH-HOST-NEXT:    [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
37 // IR-PCH-HOST-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
38 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load i32, ptr [[J]], align 4
39 // IR-PCH-HOST-NEXT:    store i32 [[TMP0]], ptr [[J_CASTED]], align 4
40 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
41 // IR-PCH-HOST-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
42 // IR-PCH-HOST-NEXT:    ret i32 0
43 // IR-PCH-HOST-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
44 // IR-PCH-HOST-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
45 // IR-PCH-HOST-NEXT:  entry:
46 // IR-PCH-HOST-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
47 // IR-PCH-HOST-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
48 // IR-PCH-HOST-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
49 // IR-PCH-HOST-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
50 // IR-PCH-HOST-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
51 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
52 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
53 // IR-PCH-HOST-NEXT:    store i32 [[TMP1]], ptr [[J_CASTED]], align 4
54 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
55 // IR-PCH-HOST-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @.omp_outlined., i64 [[TMP2]], ptr [[TMP0]])
56 // IR-PCH-HOST-NEXT:    ret void
57 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined.
58 // IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
59 // IR-PCH-HOST-NEXT:  entry:
60 // IR-PCH-HOST-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
61 // IR-PCH-HOST-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
62 // IR-PCH-HOST-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
63 // IR-PCH-HOST-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
64 // IR-PCH-HOST-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
65 // IR-PCH-HOST-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
66 // IR-PCH-HOST-NEXT:    [[TMP:%.*]] = alloca i32, align 4
67 // IR-PCH-HOST-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
68 // IR-PCH-HOST-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
69 // IR-PCH-HOST-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
70 // IR-PCH-HOST-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
71 // IR-PCH-HOST-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
72 // IR-PCH-HOST-NEXT:    [[J3:%.*]] = alloca i32, align 4
73 // IR-PCH-HOST-NEXT:    [[I:%.*]] = alloca i32, align 4
74 // IR-PCH-HOST-NEXT:    [[J4:%.*]] = alloca i32, align 4
75 // IR-PCH-HOST-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
76 // IR-PCH-HOST-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
77 // IR-PCH-HOST-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
78 // IR-PCH-HOST-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
79 // IR-PCH-HOST-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
80 // IR-PCH-HOST-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
81 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
82 // IR-PCH-HOST-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
83 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
84 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
85 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
86 // IR-PCH-HOST:       omp.arrayinit.body:
87 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
88 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
89 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
90 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
91 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
92 // IR-PCH-HOST:       omp.arrayinit.done:
93 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
94 // IR-PCH-HOST-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
95 // IR-PCH-HOST-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
96 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
97 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
98 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
99 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
100 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
101 // IR-PCH-HOST-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
102 // IR-PCH-HOST-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
103 // IR-PCH-HOST:       cond.true:
104 // IR-PCH-HOST-NEXT:    br label [[COND_END:%.*]]
105 // IR-PCH-HOST:       cond.false:
106 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
107 // IR-PCH-HOST-NEXT:    br label [[COND_END]]
108 // IR-PCH-HOST:       cond.end:
109 // IR-PCH-HOST-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
110 // IR-PCH-HOST-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
111 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
112 // IR-PCH-HOST-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
113 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
114 // IR-PCH-HOST:       omp.inner.for.cond:
115 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
116 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
117 // IR-PCH-HOST-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
118 // IR-PCH-HOST-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
119 // IR-PCH-HOST:       omp.inner.for.body:
120 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
121 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
122 // IR-PCH-HOST-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
123 // IR-PCH-HOST-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
124 // IR-PCH-HOST-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
125 // IR-PCH-HOST-NEXT:    store i32 [[TMP13]], ptr [[J_CASTED]], align 4
126 // IR-PCH-HOST-NEXT:    [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
127 // IR-PCH-HOST-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @.omp_outlined..1, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
128 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
129 // IR-PCH-HOST:       omp.inner.for.inc:
130 // IR-PCH-HOST-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
131 // IR-PCH-HOST-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
132 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
133 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
134 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND]]
135 // IR-PCH-HOST:       omp.inner.for.end:
136 // IR-PCH-HOST-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
137 // IR-PCH-HOST:       omp.loop.exit:
138 // IR-PCH-HOST-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
139 // IR-PCH-HOST-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
140 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
141 // IR-PCH-HOST-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
142 // IR-PCH-HOST-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
143 // IR-PCH-HOST-NEXT:    br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
144 // IR-PCH-HOST:       .omp.lastprivate.then:
145 // IR-PCH-HOST-NEXT:    store i32 10, ptr [[J3]], align 4
146 // IR-PCH-HOST-NEXT:    [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
147 // IR-PCH-HOST-NEXT:    store i32 [[TMP21]], ptr [[J_ADDR]], align 4
148 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
149 // IR-PCH-HOST:       .omp.lastprivate.done:
150 // IR-PCH-HOST-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
151 // IR-PCH-HOST-NEXT:    store ptr [[SUM1]], ptr [[TMP22]], align 8
152 // IR-PCH-HOST-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
153 // IR-PCH-HOST-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
154 // IR-PCH-HOST-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func.2, ptr @.gomp_critical_user_.reduction.var)
155 // IR-PCH-HOST-NEXT:    switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
156 // IR-PCH-HOST-NEXT:    i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
157 // IR-PCH-HOST-NEXT:    i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
158 // IR-PCH-HOST-NEXT:    ]
159 // IR-PCH-HOST:       .omp.reduction.case1:
160 // IR-PCH-HOST-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
161 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
162 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
163 // IR-PCH-HOST:       omp.arraycpy.body:
164 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
165 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
166 // IR-PCH-HOST-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
167 // IR-PCH-HOST-NEXT:    [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
168 // IR-PCH-HOST-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
169 // IR-PCH-HOST-NEXT:    store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
170 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
171 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
172 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
173 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
174 // IR-PCH-HOST:       omp.arraycpy.done10:
175 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
176 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
177 // IR-PCH-HOST:       .omp.reduction.case2:
178 // IR-PCH-HOST-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
179 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
180 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
181 // IR-PCH-HOST:       omp.arraycpy.body12:
182 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
183 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
184 // IR-PCH-HOST-NEXT:    [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
185 // IR-PCH-HOST-NEXT:    [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
186 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
187 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
188 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
189 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
190 // IR-PCH-HOST:       omp.arraycpy.done18:
191 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
192 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
193 // IR-PCH-HOST:       .omp.reduction.default:
194 // IR-PCH-HOST-NEXT:    ret void
195 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined..1
196 // IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
197 // IR-PCH-HOST-NEXT:  entry:
198 // IR-PCH-HOST-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
199 // IR-PCH-HOST-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
200 // IR-PCH-HOST-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
201 // IR-PCH-HOST-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
202 // IR-PCH-HOST-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
203 // IR-PCH-HOST-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
204 // IR-PCH-HOST-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
205 // IR-PCH-HOST-NEXT:    [[TMP:%.*]] = alloca i32, align 4
206 // IR-PCH-HOST-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
207 // IR-PCH-HOST-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
208 // IR-PCH-HOST-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
209 // IR-PCH-HOST-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
210 // IR-PCH-HOST-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
211 // IR-PCH-HOST-NEXT:    [[J3:%.*]] = alloca i32, align 4
212 // IR-PCH-HOST-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
213 // IR-PCH-HOST-NEXT:    [[I:%.*]] = alloca i32, align 4
214 // IR-PCH-HOST-NEXT:    [[J5:%.*]] = alloca i32, align 4
215 // IR-PCH-HOST-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
216 // IR-PCH-HOST-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
217 // IR-PCH-HOST-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
218 // IR-PCH-HOST-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
219 // IR-PCH-HOST-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
220 // IR-PCH-HOST-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
221 // IR-PCH-HOST-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
222 // IR-PCH-HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
223 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
224 // IR-PCH-HOST-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
225 // IR-PCH-HOST-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
226 // IR-PCH-HOST-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
227 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
228 // IR-PCH-HOST-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
229 // IR-PCH-HOST-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
230 // IR-PCH-HOST-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
231 // IR-PCH-HOST-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
232 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
233 // IR-PCH-HOST-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
234 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
235 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
236 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
237 // IR-PCH-HOST:       omp.arrayinit.body:
238 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
239 // IR-PCH-HOST-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
240 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
241 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
242 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
243 // IR-PCH-HOST:       omp.arrayinit.done:
244 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
245 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
246 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
247 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
248 // IR-PCH-HOST-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
249 // IR-PCH-HOST-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
250 // IR-PCH-HOST:       cond.true:
251 // IR-PCH-HOST-NEXT:    br label [[COND_END:%.*]]
252 // IR-PCH-HOST:       cond.false:
253 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
254 // IR-PCH-HOST-NEXT:    br label [[COND_END]]
255 // IR-PCH-HOST:       cond.end:
256 // IR-PCH-HOST-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
257 // IR-PCH-HOST-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
258 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
259 // IR-PCH-HOST-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
260 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
261 // IR-PCH-HOST:       omp.inner.for.cond:
262 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
263 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
264 // IR-PCH-HOST-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
265 // IR-PCH-HOST-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
266 // IR-PCH-HOST:       omp.inner.for.body:
267 // IR-PCH-HOST-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
268 // IR-PCH-HOST-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
269 // IR-PCH-HOST-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
270 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
271 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
272 // IR-PCH-HOST-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
273 // IR-PCH-HOST-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
274 // IR-PCH-HOST-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
275 // IR-PCH-HOST-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
276 // IR-PCH-HOST-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
277 // IR-PCH-HOST-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
278 // IR-PCH-HOST-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
279 // IR-PCH-HOST-NEXT:    store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
280 // IR-PCH-HOST-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
281 // IR-PCH-HOST-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
282 // IR-PCH-HOST-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
283 // IR-PCH-HOST-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
284 // IR-PCH-HOST-NEXT:    [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
285 // IR-PCH-HOST-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
286 // IR-PCH-HOST-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
287 // IR-PCH-HOST-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
288 // IR-PCH-HOST-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
289 // IR-PCH-HOST-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
290 // IR-PCH-HOST-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
291 // IR-PCH-HOST:       omp.body.continue:
292 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
293 // IR-PCH-HOST:       omp.inner.for.inc:
294 // IR-PCH-HOST-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
295 // IR-PCH-HOST-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
296 // IR-PCH-HOST-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
297 // IR-PCH-HOST-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
298 // IR-PCH-HOST:       omp.inner.for.end:
299 // IR-PCH-HOST-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
300 // IR-PCH-HOST:       omp.loop.exit:
301 // IR-PCH-HOST-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
302 // IR-PCH-HOST-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
303 // IR-PCH-HOST-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
304 // IR-PCH-HOST-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
305 // IR-PCH-HOST-NEXT:    store ptr [[SUM4]], ptr [[TMP21]], align 8
306 // IR-PCH-HOST-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
307 // IR-PCH-HOST-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
308 // IR-PCH-HOST-NEXT:    [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
309 // IR-PCH-HOST-NEXT:    switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
310 // IR-PCH-HOST-NEXT:    i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
311 // IR-PCH-HOST-NEXT:    i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
312 // IR-PCH-HOST-NEXT:    ]
313 // IR-PCH-HOST:       .omp.reduction.case1:
314 // IR-PCH-HOST-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
315 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
316 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
317 // IR-PCH-HOST:       omp.arraycpy.body:
318 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
319 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
320 // IR-PCH-HOST-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
321 // IR-PCH-HOST-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
322 // IR-PCH-HOST-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
323 // IR-PCH-HOST-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
324 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
325 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
326 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
327 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
328 // IR-PCH-HOST:       omp.arraycpy.done19:
329 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
330 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
331 // IR-PCH-HOST:       .omp.reduction.case2:
332 // IR-PCH-HOST-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
333 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
334 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
335 // IR-PCH-HOST:       omp.arraycpy.body21:
336 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
337 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
338 // IR-PCH-HOST-NEXT:    [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
339 // IR-PCH-HOST-NEXT:    [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
340 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
341 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
342 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
343 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
344 // IR-PCH-HOST:       omp.arraycpy.done27:
345 // IR-PCH-HOST-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
346 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
347 // IR-PCH-HOST:       .omp.reduction.default:
348 // IR-PCH-HOST-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
349 // IR-PCH-HOST-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
350 // IR-PCH-HOST-NEXT:    br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
351 // IR-PCH-HOST:       .omp.lastprivate.then:
352 // IR-PCH-HOST-NEXT:    store i32 10, ptr [[J3]], align 4
353 // IR-PCH-HOST-NEXT:    [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
354 // IR-PCH-HOST-NEXT:    store i32 [[TMP33]], ptr [[J_ADDR]], align 4
355 // IR-PCH-HOST-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
356 // IR-PCH-HOST:       .omp.lastprivate.done:
357 // IR-PCH-HOST-NEXT:    ret void
358 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func
359 // IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
360 // IR-PCH-HOST-NEXT:  entry:
361 // IR-PCH-HOST-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
362 // IR-PCH-HOST-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
363 // IR-PCH-HOST-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
364 // IR-PCH-HOST-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
365 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
366 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
367 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
368 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
369 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
370 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
371 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
372 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
373 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
374 // IR-PCH-HOST:       omp.arraycpy.body:
375 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
376 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
377 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
378 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
379 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
380 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
381 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
382 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
383 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
384 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
385 // IR-PCH-HOST:       omp.arraycpy.done2:
386 // IR-PCH-HOST-NEXT:    ret void
387 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func.2
388 // IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
389 // IR-PCH-HOST-NEXT:  entry:
390 // IR-PCH-HOST-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
391 // IR-PCH-HOST-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
392 // IR-PCH-HOST-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
393 // IR-PCH-HOST-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
394 // IR-PCH-HOST-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
395 // IR-PCH-HOST-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
396 // IR-PCH-HOST-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
397 // IR-PCH-HOST-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
398 // IR-PCH-HOST-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
399 // IR-PCH-HOST-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
400 // IR-PCH-HOST-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
401 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
402 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
403 // IR-PCH-HOST:       omp.arraycpy.body:
404 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
405 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
406 // IR-PCH-HOST-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
407 // IR-PCH-HOST-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
408 // IR-PCH-HOST-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
409 // IR-PCH-HOST-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
410 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
411 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
412 // IR-PCH-HOST-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
413 // IR-PCH-HOST-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
414 // IR-PCH-HOST:       omp.arraycpy.done2:
415 // IR-PCH-HOST-NEXT:    ret void
416 // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
417 // CHECK-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
418 // CHECK-NEXT:  entry:
419 // CHECK-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
420 // CHECK-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
421 // CHECK-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
422 // CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
423 // CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
424 // CHECK-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
425 // CHECK-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
426 // CHECK-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
427 // CHECK-NEXT:    [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
428 // CHECK-NEXT:    [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
429 // CHECK-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
430 // CHECK-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
431 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
432 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false)
433 // CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
434 // CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
435 // CHECK:       user_code.entry:
436 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
437 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
438 // CHECK-NEXT:    store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
439 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
440 // CHECK-NEXT:    store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
441 // CHECK-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
442 // CHECK-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
443 // CHECK-NEXT:    call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
444 // CHECK-NEXT:    ret void
445 // CHECK:       worker.exit:
446 // CHECK-NEXT:    ret void
447 // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
448 // CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
449 // CHECK-NEXT:  entry:
450 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
451 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
452 // CHECK-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
453 // CHECK-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
454 // CHECK-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
455 // CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
456 // CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
457 // CHECK-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
458 // CHECK-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
459 // CHECK-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
460 // CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
461 // CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
462 // CHECK-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
463 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
464 // CHECK-NEXT:    [[J4:%.*]] = alloca i32, align 4, addrspace(5)
465 // CHECK-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
466 // CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
467 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
468 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
469 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
470 // CHECK-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
471 // CHECK-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
472 // CHECK-NEXT:    [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
473 // CHECK-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
474 // CHECK-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
475 // CHECK-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
476 // CHECK-NEXT:    [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
477 // CHECK-NEXT:    [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
478 // CHECK-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
479 // CHECK-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
480 // CHECK-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
481 // CHECK-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
482 // CHECK-NEXT:    [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
483 // CHECK-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
484 // CHECK-NEXT:    [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
485 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
486 // CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
487 // CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
488 // CHECK-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
489 // CHECK-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
490 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
491 // CHECK-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
492 // CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
493 // CHECK-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
494 // CHECK-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
495 // CHECK:       omp.arrayinit.body:
496 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
497 // CHECK-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
498 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
499 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
500 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
501 // CHECK:       omp.arrayinit.done:
502 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
503 // CHECK-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
504 // CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
505 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
506 // CHECK-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
507 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
508 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
509 // CHECK-NEXT:    call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
510 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
511 // CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
512 // CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
513 // CHECK:       cond.true:
514 // CHECK-NEXT:    br label [[COND_END:%.*]]
515 // CHECK:       cond.false:
516 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
517 // CHECK-NEXT:    br label [[COND_END]]
518 // CHECK:       cond.end:
519 // CHECK-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
520 // CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
521 // CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
522 // CHECK-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
523 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
524 // CHECK:       omp.inner.for.cond:
525 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
526 // CHECK-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
527 // CHECK-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
528 // CHECK:       omp.inner.for.body:
529 // CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
530 // CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
531 // CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
532 // CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
533 // CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
534 // CHECK-NEXT:    store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
535 // CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
536 // CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
537 // CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
538 // CHECK-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
539 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
540 // CHECK-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
541 // CHECK-NEXT:    store ptr [[TMP17]], ptr [[TMP16]], align 8
542 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
543 // CHECK-NEXT:    [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
544 // CHECK-NEXT:    store ptr [[TMP19]], ptr [[TMP18]], align 8
545 // CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
546 // CHECK-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
547 // CHECK-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
548 // CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
549 // CHECK-NEXT:    call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__.1, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
550 // CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
551 // CHECK:       omp.inner.for.inc:
552 // CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
553 // CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
554 // CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
555 // CHECK-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
556 // CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
557 // CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
558 // CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
559 // CHECK-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
560 // CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
561 // CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
562 // CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
563 // CHECK-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
564 // CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
565 // CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
566 // CHECK-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
567 // CHECK:       cond.true9:
568 // CHECK-NEXT:    br label [[COND_END11:%.*]]
569 // CHECK:       cond.false10:
570 // CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
571 // CHECK-NEXT:    br label [[COND_END11]]
572 // CHECK:       cond.end11:
573 // CHECK-NEXT:    [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
574 // CHECK-NEXT:    store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
575 // CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
576 // CHECK-NEXT:    store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
577 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
578 // CHECK:       omp.inner.for.end:
579 // CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
580 // CHECK:       omp.loop.exit:
581 // CHECK-NEXT:    [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
582 // CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
583 // CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP33]])
584 // CHECK-NEXT:    [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
585 // CHECK-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
586 // CHECK-NEXT:    br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
587 // CHECK:       .omp.lastprivate.then:
588 // CHECK-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
589 // CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
590 // CHECK-NEXT:    store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
591 // CHECK-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
592 // CHECK:       .omp.lastprivate.done:
593 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
594 // CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
595 // CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
596 // CHECK-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
597 // CHECK-NEXT:    [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
598 // CHECK-NEXT:    [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.3, ptr @_omp_reduction_inter_warp_copy_func.4, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
599 // CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
600 // CHECK-NEXT:    br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
601 // CHECK:       .omp.reduction.then:
602 // CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
603 // CHECK-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
604 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
605 // CHECK:       omp.arraycpy.body:
606 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
607 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
608 // CHECK-NEXT:    [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
609 // CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
610 // CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
611 // CHECK-NEXT:    store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
612 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
613 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
614 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
615 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
616 // CHECK:       omp.arraycpy.done17:
617 // CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
618 // CHECK:       .omp.reduction.done:
619 // CHECK-NEXT:    ret void
620 // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__.1
621 // CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
622 // CHECK-NEXT:  entry:
623 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
624 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
625 // CHECK-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
626 // CHECK-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
627 // CHECK-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
628 // CHECK-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
629 // CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
630 // CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
631 // CHECK-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
632 // CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
633 // CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
634 // CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
635 // CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
636 // CHECK-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
637 // CHECK-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
638 // CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
639 // CHECK-NEXT:    [[J5:%.*]] = alloca i32, align 4, addrspace(5)
640 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
641 // CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
642 // CHECK-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
643 // CHECK-NEXT:    [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
644 // CHECK-NEXT:    [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
645 // CHECK-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
646 // CHECK-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
647 // CHECK-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
648 // CHECK-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
649 // CHECK-NEXT:    [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
650 // CHECK-NEXT:    [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
651 // CHECK-NEXT:    [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
652 // CHECK-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
653 // CHECK-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
654 // CHECK-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
655 // CHECK-NEXT:    [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
656 // CHECK-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
657 // CHECK-NEXT:    [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
658 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
659 // CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
660 // CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
661 // CHECK-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
662 // CHECK-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
663 // CHECK-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
664 // CHECK-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
665 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
666 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
667 // CHECK-NEXT:    store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
668 // CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
669 // CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
670 // CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
671 // CHECK-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
672 // CHECK-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
673 // CHECK-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
674 // CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
675 // CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
676 // CHECK-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
677 // CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
678 // CHECK-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
679 // CHECK-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
680 // CHECK:       omp.arrayinit.body:
681 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
682 // CHECK-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
683 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
684 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
685 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
686 // CHECK:       omp.arrayinit.done:
687 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
688 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
689 // CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
690 // CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
691 // CHECK-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
692 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
693 // CHECK:       omp.inner.for.cond:
694 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
695 // CHECK-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
696 // CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
697 // CHECK-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
698 // CHECK-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
699 // CHECK:       omp.inner.for.body:
700 // CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
701 // CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
702 // CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
703 // CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
704 // CHECK-NEXT:    store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
705 // CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
706 // CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
707 // CHECK-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
708 // CHECK-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
709 // CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
710 // CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
711 // CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
712 // CHECK-NEXT:    store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
713 // CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
714 // CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
715 // CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
716 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
717 // CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
718 // CHECK-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
719 // CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
720 // CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
721 // CHECK-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
722 // CHECK-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
723 // CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
724 // CHECK:       omp.body.continue:
725 // CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
726 // CHECK:       omp.inner.for.inc:
727 // CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
728 // CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
729 // CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
730 // CHECK-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
731 // CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
732 // CHECK:       omp.inner.for.end:
733 // CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
734 // CHECK:       omp.loop.exit:
735 // CHECK-NEXT:    [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
736 // CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
737 // CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
738 // CHECK-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
739 // CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
740 // CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
741 // CHECK-NEXT:    store ptr [[SUM4_ASCAST]], ptr [[TMP22]], align 8
742 // CHECK-NEXT:    [[TMP23:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP21]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
743 // CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP23]], 1
744 // CHECK-NEXT:    br i1 [[TMP24]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
745 // CHECK:       .omp.reduction.then:
746 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
747 // CHECK-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
748 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
749 // CHECK:       omp.arraycpy.body:
750 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
751 // CHECK-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
752 // CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
753 // CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
754 // CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
755 // CHECK-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
756 // CHECK-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
757 // CHECK-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
758 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
759 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
760 // CHECK:       omp.arraycpy.done19:
761 // CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
762 // CHECK:       .omp.reduction.done:
763 // CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
764 // CHECK-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
765 // CHECK-NEXT:    br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
766 // CHECK:       .omp.lastprivate.then:
767 // CHECK-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
768 // CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
769 // CHECK-NEXT:    store i32 [[TMP30]], ptr [[J_ADDR_ASCAST]], align 4
770 // CHECK-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
771 // CHECK:       .omp.lastprivate.done:
772 // CHECK-NEXT:    ret void
773 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
774 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
775 // CHECK-NEXT:  entry:
776 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
777 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
778 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
779 // CHECK-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
780 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
781 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
782 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
783 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
784 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
785 // CHECK-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
786 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
787 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
788 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
789 // CHECK-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
790 // CHECK-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
791 // CHECK-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
792 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
793 // CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
794 // CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
795 // CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
796 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
797 // CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
798 // CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
799 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
800 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
801 // CHECK:       .shuffle.pre_cond:
802 // CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
803 // CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
804 // CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
805 // CHECK-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
806 // CHECK-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
807 // CHECK-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
808 // CHECK-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
809 // CHECK-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
810 // CHECK:       .shuffle.then:
811 // CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
812 // CHECK-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
813 // CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
814 // CHECK-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
815 // CHECK-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
816 // CHECK-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
817 // CHECK-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
818 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
819 // CHECK:       .shuffle.exit:
820 // CHECK-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
821 // CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
822 // CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
823 // CHECK-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
824 // CHECK-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
825 // CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
826 // CHECK-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
827 // CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
828 // CHECK-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
829 // CHECK-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
830 // CHECK-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
831 // CHECK-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
832 // CHECK-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
833 // CHECK-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
834 // CHECK:       then:
835 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
836 // CHECK-NEXT:    br label [[IFCONT:%.*]]
837 // CHECK:       else:
838 // CHECK-NEXT:    br label [[IFCONT]]
839 // CHECK:       ifcont:
840 // CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
841 // CHECK-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
842 // CHECK-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
843 // CHECK-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
844 // CHECK:       then4:
845 // CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
846 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
847 // CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
848 // CHECK-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
849 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
850 // CHECK-NEXT:    br label [[IFCONT6:%.*]]
851 // CHECK:       else5:
852 // CHECK-NEXT:    br label [[IFCONT6]]
853 // CHECK:       ifcont6:
854 // CHECK-NEXT:    ret void
855 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
856 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
857 // CHECK-NEXT:  entry:
858 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
859 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
860 // CHECK-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
861 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
862 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
863 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
864 // CHECK-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
865 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
866 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
867 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
868 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
869 // CHECK-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
870 // CHECK-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
871 // CHECK-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
872 // CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
873 // CHECK-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
874 // CHECK-NEXT:    br label [[PRECOND:%.*]]
875 // CHECK:       precond:
876 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
877 // CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
878 // CHECK-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
879 // CHECK:       body:
880 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
881 // CHECK-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
882 // CHECK-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
883 // CHECK:       then:
884 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
885 // CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
886 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
887 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
888 // CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
889 // CHECK-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
890 // CHECK-NEXT:    br label [[IFCONT:%.*]]
891 // CHECK:       else:
892 // CHECK-NEXT:    br label [[IFCONT]]
893 // CHECK:       ifcont:
894 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
895 // CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
896 // CHECK-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
897 // CHECK-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
898 // CHECK:       then2:
899 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
900 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
901 // CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
902 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
903 // CHECK-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
904 // CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
905 // CHECK-NEXT:    br label [[IFCONT4:%.*]]
906 // CHECK:       else3:
907 // CHECK-NEXT:    br label [[IFCONT4]]
908 // CHECK:       ifcont4:
909 // CHECK-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
910 // CHECK-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
911 // CHECK-NEXT:    br label [[PRECOND]]
912 // CHECK:       exit:
913 // CHECK-NEXT:    ret void
914 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.3
915 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
916 // CHECK-NEXT:  entry:
917 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
918 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
919 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
920 // CHECK-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
921 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
922 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
923 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
924 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
925 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
926 // CHECK-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
927 // CHECK-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
928 // CHECK-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
929 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
930 // CHECK-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
931 // CHECK-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
932 // CHECK-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
933 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
934 // CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
935 // CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
936 // CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
937 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
938 // CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
939 // CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
940 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
941 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
942 // CHECK:       .shuffle.pre_cond:
943 // CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
944 // CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
945 // CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
946 // CHECK-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
947 // CHECK-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
948 // CHECK-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
949 // CHECK-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
950 // CHECK-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
951 // CHECK:       .shuffle.then:
952 // CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
953 // CHECK-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
954 // CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
955 // CHECK-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
956 // CHECK-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
957 // CHECK-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
958 // CHECK-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
959 // CHECK-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
960 // CHECK:       .shuffle.exit:
961 // CHECK-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
962 // CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
963 // CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
964 // CHECK-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
965 // CHECK-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
966 // CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
967 // CHECK-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
968 // CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
969 // CHECK-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
970 // CHECK-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
971 // CHECK-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
972 // CHECK-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
973 // CHECK-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
974 // CHECK-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
975 // CHECK:       then:
976 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func.2"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
977 // CHECK-NEXT:    br label [[IFCONT:%.*]]
978 // CHECK:       else:
979 // CHECK-NEXT:    br label [[IFCONT]]
980 // CHECK:       ifcont:
981 // CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
982 // CHECK-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
983 // CHECK-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
984 // CHECK-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
985 // CHECK:       then4:
986 // CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
987 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
988 // CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
989 // CHECK-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
990 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
991 // CHECK-NEXT:    br label [[IFCONT6:%.*]]
992 // CHECK:       else5:
993 // CHECK-NEXT:    br label [[IFCONT6]]
994 // CHECK:       ifcont6:
995 // CHECK-NEXT:    ret void
996 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.4
997 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
998 // CHECK-NEXT:  entry:
999 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1000 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1001 // CHECK-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1002 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1003 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1004 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1005 // CHECK-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1006 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1007 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1008 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1009 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1010 // CHECK-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1011 // CHECK-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1012 // CHECK-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1013 // CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1014 // CHECK-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1015 // CHECK-NEXT:    br label [[PRECOND:%.*]]
1016 // CHECK:       precond:
1017 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1018 // CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1019 // CHECK-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1020 // CHECK:       body:
1021 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1022 // CHECK-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1023 // CHECK-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1024 // CHECK:       then:
1025 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1026 // CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1027 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1028 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1029 // CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1030 // CHECK-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1031 // CHECK-NEXT:    br label [[IFCONT:%.*]]
1032 // CHECK:       else:
1033 // CHECK-NEXT:    br label [[IFCONT]]
1034 // CHECK:       ifcont:
1035 // CHECK-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1036 // CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1037 // CHECK-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1038 // CHECK-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1039 // CHECK:       then2:
1040 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1041 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1042 // CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1043 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1044 // CHECK-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1045 // CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
1046 // CHECK-NEXT:    br label [[IFCONT4:%.*]]
1047 // CHECK:       else3:
1048 // CHECK-NEXT:    br label [[IFCONT4]]
1049 // CHECK:       ifcont4:
1050 // CHECK-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1051 // CHECK-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1052 // CHECK-NEXT:    br label [[PRECOND]]
1053 // CHECK:       exit:
1054 // CHECK-NEXT:    ret void
1055 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1056 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1057 // CHECK-NEXT:  entry:
1058 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1059 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1060 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1061 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1062 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1063 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1064 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1065 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1066 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1067 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1068 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1069 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1070 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1071 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1072 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1073 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1074 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
1075 // CHECK-NEXT:    ret void
1076 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1077 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1078 // CHECK-NEXT:  entry:
1079 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1080 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1081 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1082 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1083 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1084 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1085 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1086 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1087 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1088 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1089 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1090 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1091 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1092 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1093 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1094 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1095 // CHECK-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
1096 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1097 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func.2"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
1098 // CHECK-NEXT:    ret void
1099 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1100 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1101 // CHECK-NEXT:  entry:
1102 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1103 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1104 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1105 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1106 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1107 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1108 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1109 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1110 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1111 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1112 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1113 // CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1114 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1115 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1116 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1117 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1118 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 128 [[TMP8]], i64 400, i1 false)
1119 // CHECK-NEXT:    ret void
1120 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1121 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1122 // CHECK-NEXT:  entry:
1123 // CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1124 // CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1125 // CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1126 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1127 // CHECK-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1128 // CHECK-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1129 // CHECK-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1130 // CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1131 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1132 // CHECK-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1133 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1134 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1135 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1136 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1137 // CHECK-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1138 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1139 // CHECK-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
1140 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1141 // CHECK-NEXT:    call void @"_omp$reduction$reduction_func.2"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
1142 // CHECK-NEXT:    ret void
1143 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
1144 // IR-GPU-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
1145 // IR-GPU-NEXT:  entry:
1146 // IR-GPU-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1147 // IR-GPU-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1148 // IR-GPU-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1149 // IR-GPU-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
1150 // IR-GPU-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1151 // IR-GPU-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
1152 // IR-GPU-NEXT:    [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
1153 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1154 // IR-GPU-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1155 // IR-GPU-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
1156 // IR-GPU-NEXT:    [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
1157 // IR-GPU-NEXT:    [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
1158 // IR-GPU-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
1159 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1160 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1161 // IR-GPU-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1162 // IR-GPU-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_kernel_environment to ptr), ptr [[DYN_PTR]])
1163 // IR-GPU-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
1164 // IR-GPU-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1165 // IR-GPU:       user_code.entry:
1166 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
1167 // IR-GPU-NEXT:    [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
1168 // IR-GPU-NEXT:    store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
1169 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
1170 // IR-GPU-NEXT:    store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
1171 // IR-GPU-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
1172 // IR-GPU-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
1173 // IR-GPU-NEXT:    call void @__kmpc_target_deinit()
1174 // IR-GPU-NEXT:    ret void
1175 // IR-GPU:       worker.exit:
1176 // IR-GPU-NEXT:    ret void
1177 //
1178 //
1179 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined
1180 // IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
1181 // IR-GPU-NEXT:  entry:
1182 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1183 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1184 // IR-GPU-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1185 // IR-GPU-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1186 // IR-GPU-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1187 // IR-GPU-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
1188 // IR-GPU-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
1189 // IR-GPU-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
1190 // IR-GPU-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
1191 // IR-GPU-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
1192 // IR-GPU-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
1193 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
1194 // IR-GPU-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
1195 // IR-GPU-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
1196 // IR-GPU-NEXT:    [[J4:%.*]] = alloca i32, align 4, addrspace(5)
1197 // IR-GPU-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
1198 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
1199 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1200 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
1201 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
1202 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1203 // IR-GPU-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1204 // IR-GPU-NEXT:    [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
1205 // IR-GPU-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
1206 // IR-GPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
1207 // IR-GPU-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
1208 // IR-GPU-NEXT:    [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
1209 // IR-GPU-NEXT:    [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
1210 // IR-GPU-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
1211 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
1212 // IR-GPU-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
1213 // IR-GPU-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
1214 // IR-GPU-NEXT:    [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
1215 // IR-GPU-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
1216 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
1217 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1218 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1219 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
1220 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1221 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1222 // IR-GPU-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1223 // IR-GPU-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
1224 // IR-GPU-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1225 // IR-GPU-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
1226 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1227 // IR-GPU:       omp.arrayinit.body:
1228 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1229 // IR-GPU-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1230 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1231 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
1232 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1233 // IR-GPU:       omp.arrayinit.done:
1234 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1235 // IR-GPU-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1236 // IR-GPU-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1237 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1238 // IR-GPU-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1239 // IR-GPU-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1240 // IR-GPU-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
1241 // IR-GPU-NEXT:    call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
1242 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1243 // IR-GPU-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
1244 // IR-GPU-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1245 // IR-GPU:       cond.true:
1246 // IR-GPU-NEXT:    br label [[COND_END:%.*]]
1247 // IR-GPU:       cond.false:
1248 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1249 // IR-GPU-NEXT:    br label [[COND_END]]
1250 // IR-GPU:       cond.end:
1251 // IR-GPU-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
1252 // IR-GPU-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1253 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1254 // IR-GPU-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
1255 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
1256 // IR-GPU:       omp.inner.for.cond:
1257 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
1258 // IR-GPU-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
1259 // IR-GPU-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1260 // IR-GPU:       omp.inner.for.body:
1261 // IR-GPU-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1262 // IR-GPU-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
1263 // IR-GPU-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1264 // IR-GPU-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
1265 // IR-GPU-NEXT:    [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1266 // IR-GPU-NEXT:    store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
1267 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
1268 // IR-GPU-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
1269 // IR-GPU-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
1270 // IR-GPU-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
1271 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
1272 // IR-GPU-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
1273 // IR-GPU-NEXT:    store ptr [[TMP17]], ptr [[TMP16]], align 8
1274 // IR-GPU-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
1275 // IR-GPU-NEXT:    [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
1276 // IR-GPU-NEXT:    store ptr [[TMP19]], ptr [[TMP18]], align 8
1277 // IR-GPU-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
1278 // IR-GPU-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
1279 // IR-GPU-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1280 // IR-GPU-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
1281 // IR-GPU-NEXT:    call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
1282 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
1283 // IR-GPU:       omp.inner.for.inc:
1284 // IR-GPU-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
1285 // IR-GPU-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1286 // IR-GPU-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
1287 // IR-GPU-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
1288 // IR-GPU-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1289 // IR-GPU-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1290 // IR-GPU-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
1291 // IR-GPU-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1292 // IR-GPU-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1293 // IR-GPU-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1294 // IR-GPU-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
1295 // IR-GPU-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1296 // IR-GPU-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1297 // IR-GPU-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
1298 // IR-GPU-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
1299 // IR-GPU:       cond.true9:
1300 // IR-GPU-NEXT:    br label [[COND_END11:%.*]]
1301 // IR-GPU:       cond.false10:
1302 // IR-GPU-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1303 // IR-GPU-NEXT:    br label [[COND_END11]]
1304 // IR-GPU:       cond.end11:
1305 // IR-GPU-NEXT:    [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
1306 // IR-GPU-NEXT:    store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1307 // IR-GPU-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1308 // IR-GPU-NEXT:    store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
1309 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND]]
1310 // IR-GPU:       omp.inner.for.end:
1311 // IR-GPU-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
1312 // IR-GPU:       omp.loop.exit:
1313 // IR-GPU-NEXT:    [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1314 // IR-GPU-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
1315 // IR-GPU-NEXT:    call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP33]])
1316 // IR-GPU-NEXT:    [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1317 // IR-GPU-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
1318 // IR-GPU-NEXT:    br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
1319 // IR-GPU:       .omp.lastprivate.then:
1320 // IR-GPU-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
1321 // IR-GPU-NEXT:    [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1322 // IR-GPU-NEXT:    store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
1323 // IR-GPU-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
1324 // IR-GPU:       .omp.lastprivate.done:
1325 // IR-GPU-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1326 // IR-GPU-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
1327 // IR-GPU-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1328 // IR-GPU-NEXT:    store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
1329 // IR-GPU-NEXT:    %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
1330 // IR-GPU-NEXT:    [[TMP40:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
1331 // IR-GPU-NEXT:    [[TMP41:%.*]] = icmp eq i32 [[TMP40]], 1
1332 // IR-GPU-NEXT:    br i1 [[TMP41]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1333 // IR-GPU:       .omp.reduction.then:
1334 // IR-GPU-NEXT:    [[TMP42:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
1335 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP42]]
1336 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
1337 // IR-GPU:       omp.arraycpy.body:
1338 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1339 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1340 // IR-GPU-NEXT:    [[TMP43:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
1341 // IR-GPU-NEXT:    [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
1342 // IR-GPU-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP43]], [[TMP44]]
1343 // IR-GPU-NEXT:    store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
1344 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
1345 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
1346 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP42]]
1347 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
1348 // IR-GPU:       omp.arraycpy.done17:
1349 // IR-GPU-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
1350 // IR-GPU:       .omp.reduction.done:
1351 // IR-GPU-NEXT:    ret void
1352 //
1353 //
1354 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined
1355 // IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
1356 // IR-GPU-NEXT:  entry:
1357 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1358 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1359 // IR-GPU-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1360 // IR-GPU-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1361 // IR-GPU-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1362 // IR-GPU-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1363 // IR-GPU-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
1364 // IR-GPU-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
1365 // IR-GPU-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
1366 // IR-GPU-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
1367 // IR-GPU-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
1368 // IR-GPU-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
1369 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
1370 // IR-GPU-NEXT:    [[J3:%.*]] = alloca i32, align 4, addrspace(5)
1371 // IR-GPU-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1372 // IR-GPU-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
1373 // IR-GPU-NEXT:    [[J5:%.*]] = alloca i32, align 4, addrspace(5)
1374 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1375 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
1376 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
1377 // IR-GPU-NEXT:    [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
1378 // IR-GPU-NEXT:    [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
1379 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1380 // IR-GPU-NEXT:    [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1381 // IR-GPU-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
1382 // IR-GPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
1383 // IR-GPU-NEXT:    [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
1384 // IR-GPU-NEXT:    [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
1385 // IR-GPU-NEXT:    [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
1386 // IR-GPU-NEXT:    [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
1387 // IR-GPU-NEXT:    [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
1388 // IR-GPU-NEXT:    [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
1389 // IR-GPU-NEXT:    [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
1390 // IR-GPU-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
1391 // IR-GPU-NEXT:    [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
1392 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1393 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1394 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
1395 // IR-GPU-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
1396 // IR-GPU-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
1397 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1398 // IR-GPU-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1399 // IR-GPU-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1400 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
1401 // IR-GPU-NEXT:    store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
1402 // IR-GPU-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
1403 // IR-GPU-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
1404 // IR-GPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
1405 // IR-GPU-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
1406 // IR-GPU-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
1407 // IR-GPU-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
1408 // IR-GPU-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1409 // IR-GPU-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1410 // IR-GPU-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
1411 // IR-GPU-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1412 // IR-GPU-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
1413 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1414 // IR-GPU:       omp.arrayinit.body:
1415 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1416 // IR-GPU-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1417 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1418 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
1419 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1420 // IR-GPU:       omp.arrayinit.done:
1421 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1422 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
1423 // IR-GPU-NEXT:    call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
1424 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
1425 // IR-GPU-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
1426 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
1427 // IR-GPU:       omp.inner.for.cond:
1428 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
1429 // IR-GPU-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
1430 // IR-GPU-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
1431 // IR-GPU-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
1432 // IR-GPU-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1433 // IR-GPU:       omp.inner.for.body:
1434 // IR-GPU-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1435 // IR-GPU-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
1436 // IR-GPU-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
1437 // IR-GPU-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
1438 // IR-GPU-NEXT:    store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1439 // IR-GPU-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1440 // IR-GPU-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1441 // IR-GPU-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
1442 // IR-GPU-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
1443 // IR-GPU-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
1444 // IR-GPU-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
1445 // IR-GPU-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
1446 // IR-GPU-NEXT:    store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1447 // IR-GPU-NEXT:    [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1448 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1449 // IR-GPU-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
1450 // IR-GPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
1451 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1452 // IR-GPU-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
1453 // IR-GPU-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
1454 // IR-GPU-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
1455 // IR-GPU-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
1456 // IR-GPU-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
1457 // IR-GPU-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
1458 // IR-GPU:       omp.body.continue:
1459 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
1460 // IR-GPU:       omp.inner.for.inc:
1461 // IR-GPU-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1462 // IR-GPU-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1463 // IR-GPU-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
1464 // IR-GPU-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1465 // IR-GPU-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
1466 // IR-GPU:       omp.inner.for.end:
1467 // IR-GPU-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
1468 // IR-GPU:       omp.loop.exit:
1469 // IR-GPU-NEXT:    [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1470 // IR-GPU-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
1471 // IR-GPU-NEXT:    call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
1472 // IR-GPU-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1473 // IR-GPU-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
1474 // IR-GPU-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1475 // IR-GPU-NEXT:    store ptr [[SUM4_ASCAST]], ptr [[TMP22]], align 8
1476 // IR-GPU-NEXT:    [[TMP23:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP21]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
1477 // IR-GPU-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP23]], 1
1478 // IR-GPU-NEXT:    br i1 [[TMP24]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1479 // IR-GPU:       .omp.reduction.then:
1480 // IR-GPU-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
1481 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
1482 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
1483 // IR-GPU:       omp.arraycpy.body:
1484 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1485 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1486 // IR-GPU-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
1487 // IR-GPU-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
1488 // IR-GPU-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
1489 // IR-GPU-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
1490 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
1491 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
1492 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
1493 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
1494 // IR-GPU:       omp.arraycpy.done19:
1495 // IR-GPU-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
1496 // IR-GPU:       .omp.reduction.done:
1497 // IR-GPU-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1498 // IR-GPU-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
1499 // IR-GPU-NEXT:    br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
1500 // IR-GPU:       .omp.lastprivate.then:
1501 // IR-GPU-NEXT:    store i32 10, ptr [[J3_ASCAST]], align 4
1502 // IR-GPU-NEXT:    [[TMP30:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1503 // IR-GPU-NEXT:    store i32 [[TMP30]], ptr [[J_ADDR_ASCAST]], align 4
1504 // IR-GPU-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
1505 // IR-GPU:       .omp.lastprivate.done:
1506 // IR-GPU-NEXT:    ret void
1507 //
1508 //
1509 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
1510 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
1511 // IR-GPU-NEXT:  entry:
1512 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1513 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
1514 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
1515 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
1516 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1517 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1518 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1519 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1520 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1521 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
1522 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
1523 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
1524 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1525 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
1526 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
1527 // IR-GPU-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
1528 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1529 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
1530 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
1531 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
1532 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1533 // IR-GPU-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
1534 // IR-GPU-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1535 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
1536 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
1537 // IR-GPU:       .shuffle.pre_cond:
1538 // IR-GPU-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
1539 // IR-GPU-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
1540 // IR-GPU-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
1541 // IR-GPU-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
1542 // IR-GPU-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
1543 // IR-GPU-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
1544 // IR-GPU-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
1545 // IR-GPU-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
1546 // IR-GPU:       .shuffle.then:
1547 // IR-GPU-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
1548 // IR-GPU-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
1549 // IR-GPU-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1550 // IR-GPU-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
1551 // IR-GPU-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
1552 // IR-GPU-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
1553 // IR-GPU-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
1554 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
1555 // IR-GPU:       .shuffle.exit:
1556 // IR-GPU-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
1557 // IR-GPU-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
1558 // IR-GPU-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
1559 // IR-GPU-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1560 // IR-GPU-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
1561 // IR-GPU-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
1562 // IR-GPU-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
1563 // IR-GPU-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
1564 // IR-GPU-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
1565 // IR-GPU-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
1566 // IR-GPU-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
1567 // IR-GPU-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
1568 // IR-GPU-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
1569 // IR-GPU-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
1570 // IR-GPU:       then:
1571 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
1572 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1573 // IR-GPU:       else:
1574 // IR-GPU-NEXT:    br label [[IFCONT]]
1575 // IR-GPU:       ifcont:
1576 // IR-GPU-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
1577 // IR-GPU-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1578 // IR-GPU-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1579 // IR-GPU-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1580 // IR-GPU:       then4:
1581 // IR-GPU-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1582 // IR-GPU-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
1583 // IR-GPU-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1584 // IR-GPU-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
1585 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
1586 // IR-GPU-NEXT:    br label [[IFCONT6:%.*]]
1587 // IR-GPU:       else5:
1588 // IR-GPU-NEXT:    br label [[IFCONT6]]
1589 // IR-GPU:       ifcont6:
1590 // IR-GPU-NEXT:    ret void
1591 //
1592 //
1593 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
1594 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1595 // IR-GPU-NEXT:  entry:
1596 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1597 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1598 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1599 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1600 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1601 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1602 // IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1603 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1604 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1605 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1606 // IR-GPU-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1607 // IR-GPU-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1608 // IR-GPU-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1609 // IR-GPU-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1610 // IR-GPU-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1611 // IR-GPU-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1612 // IR-GPU-NEXT:    br label [[PRECOND:%.*]]
1613 // IR-GPU:       precond:
1614 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1615 // IR-GPU-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1616 // IR-GPU-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1617 // IR-GPU:       body:
1618 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
1619 // IR-GPU-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1620 // IR-GPU-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1621 // IR-GPU:       then:
1622 // IR-GPU-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1623 // IR-GPU-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1624 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1625 // IR-GPU-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1626 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1627 // IR-GPU-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1628 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1629 // IR-GPU:       else:
1630 // IR-GPU-NEXT:    br label [[IFCONT]]
1631 // IR-GPU:       ifcont:
1632 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1633 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1634 // IR-GPU-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1635 // IR-GPU-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1636 // IR-GPU:       then2:
1637 // IR-GPU-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1638 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1639 // IR-GPU-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1640 // IR-GPU-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1641 // IR-GPU-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1642 // IR-GPU-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
1643 // IR-GPU-NEXT:    br label [[IFCONT4:%.*]]
1644 // IR-GPU:       else3:
1645 // IR-GPU-NEXT:    br label [[IFCONT4]]
1646 // IR-GPU:       ifcont4:
1647 // IR-GPU-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1648 // IR-GPU-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1649 // IR-GPU-NEXT:    br label [[PRECOND]]
1650 // IR-GPU:       exit:
1651 // IR-GPU-NEXT:    ret void
1652 //
1653 //
1654 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.1
1655 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
1656 // IR-GPU-NEXT:  entry:
1657 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1658 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
1659 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
1660 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
1661 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1662 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1663 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1664 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1665 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1666 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
1667 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
1668 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
1669 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1670 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
1671 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
1672 // IR-GPU-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
1673 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1674 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
1675 // IR-GPU-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
1676 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
1677 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1678 // IR-GPU-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
1679 // IR-GPU-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1680 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
1681 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND:%.*]]
1682 // IR-GPU:       .shuffle.pre_cond:
1683 // IR-GPU-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
1684 // IR-GPU-NEXT:    [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
1685 // IR-GPU-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
1686 // IR-GPU-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
1687 // IR-GPU-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
1688 // IR-GPU-NEXT:    [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
1689 // IR-GPU-NEXT:    [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
1690 // IR-GPU-NEXT:    br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
1691 // IR-GPU:       .shuffle.then:
1692 // IR-GPU-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
1693 // IR-GPU-NEXT:    [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
1694 // IR-GPU-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1695 // IR-GPU-NEXT:    [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
1696 // IR-GPU-NEXT:    store i64 [[TMP22]], ptr [[TMP13]], align 4
1697 // IR-GPU-NEXT:    [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
1698 // IR-GPU-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
1699 // IR-GPU-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
1700 // IR-GPU:       .shuffle.exit:
1701 // IR-GPU-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
1702 // IR-GPU-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
1703 // IR-GPU-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
1704 // IR-GPU-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1705 // IR-GPU-NEXT:    [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
1706 // IR-GPU-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
1707 // IR-GPU-NEXT:    [[TMP30:%.*]] = and i16 [[TMP5]], 1
1708 // IR-GPU-NEXT:    [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
1709 // IR-GPU-NEXT:    [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
1710 // IR-GPU-NEXT:    [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
1711 // IR-GPU-NEXT:    [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
1712 // IR-GPU-NEXT:    [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
1713 // IR-GPU-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
1714 // IR-GPU-NEXT:    br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
1715 // IR-GPU:       then:
1716 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
1717 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1718 // IR-GPU:       else:
1719 // IR-GPU-NEXT:    br label [[IFCONT]]
1720 // IR-GPU:       ifcont:
1721 // IR-GPU-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
1722 // IR-GPU-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1723 // IR-GPU-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1724 // IR-GPU-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1725 // IR-GPU:       then4:
1726 // IR-GPU-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1727 // IR-GPU-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
1728 // IR-GPU-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1729 // IR-GPU-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
1730 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
1731 // IR-GPU-NEXT:    br label [[IFCONT6:%.*]]
1732 // IR-GPU:       else5:
1733 // IR-GPU-NEXT:    br label [[IFCONT6]]
1734 // IR-GPU:       ifcont6:
1735 // IR-GPU-NEXT:    ret void
1736 //
1737 //
1738 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.2
1739 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1740 // IR-GPU-NEXT:  entry:
1741 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1742 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1743 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1744 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1745 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1746 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1747 // IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1748 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1749 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1750 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1751 // IR-GPU-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1752 // IR-GPU-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1753 // IR-GPU-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1754 // IR-GPU-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1755 // IR-GPU-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1756 // IR-GPU-NEXT:    store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1757 // IR-GPU-NEXT:    br label [[PRECOND:%.*]]
1758 // IR-GPU:       precond:
1759 // IR-GPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1760 // IR-GPU-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1761 // IR-GPU-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1762 // IR-GPU:       body:
1763 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1764 // IR-GPU-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1765 // IR-GPU-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1766 // IR-GPU:       then:
1767 // IR-GPU-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1768 // IR-GPU-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1769 // IR-GPU-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1770 // IR-GPU-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1771 // IR-GPU-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1772 // IR-GPU-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1773 // IR-GPU-NEXT:    br label [[IFCONT:%.*]]
1774 // IR-GPU:       else:
1775 // IR-GPU-NEXT:    br label [[IFCONT]]
1776 // IR-GPU:       ifcont:
1777 // IR-GPU-NEXT:    call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1778 // IR-GPU-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1779 // IR-GPU-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1780 // IR-GPU-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1781 // IR-GPU:       then2:
1782 // IR-GPU-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1783 // IR-GPU-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1784 // IR-GPU-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1785 // IR-GPU-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1786 // IR-GPU-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1787 // IR-GPU-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
1788 // IR-GPU-NEXT:    br label [[IFCONT4:%.*]]
1789 // IR-GPU:       else3:
1790 // IR-GPU-NEXT:    br label [[IFCONT4]]
1791 // IR-GPU:       ifcont4:
1792 // IR-GPU-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1793 // IR-GPU-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1794 // IR-GPU-NEXT:    br label [[PRECOND]]
1795 // IR-GPU:       exit:
1796 // IR-GPU-NEXT:    ret void
1797 //
1798 //
1799 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1800 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1801 // IR-GPU-NEXT:  entry:
1802 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1803 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1804 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1805 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1806 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1807 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1808 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1809 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1810 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1811 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1812 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1813 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1814 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1815 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1816 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1817 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1818 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
1819 // IR-GPU-NEXT:    ret void
1820 //
1821 //
1822 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1823 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1824 // IR-GPU-NEXT:  entry:
1825 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1826 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1827 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1828 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1829 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1830 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1831 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1832 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1833 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1834 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1835 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1836 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1837 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1838 // IR-GPU-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1839 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1840 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1841 // IR-GPU-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
1842 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1843 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
1844 // IR-GPU-NEXT:    ret void
1845 //
1846 //
1847 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1848 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1849 // IR-GPU-NEXT:  entry:
1850 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1851 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1852 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1853 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1854 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1855 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1856 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1857 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1858 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1859 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1860 // IR-GPU-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1861 // IR-GPU-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1862 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1863 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1864 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1865 // IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1866 // IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[TMP8]], i64 400, i1 false)
1867 // IR-GPU-NEXT:    ret void
1868 //
1869 //
1870 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1871 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1872 // IR-GPU-NEXT:  entry:
1873 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1874 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1875 // IR-GPU-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1876 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1877 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1878 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1879 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1880 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1881 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1882 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1883 // IR-GPU-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1884 // IR-GPU-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1885 // IR-GPU-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1886 // IR-GPU-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1887 // IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1888 // IR-GPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1889 // IR-GPU-NEXT:    store ptr [[TMP6]], ptr [[TMP5]], align 8
1890 // IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1891 // IR-GPU-NEXT:    call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
1892 // IR-GPU-NEXT:    ret void
1893 //
1894 //
1895 // IR-LABEL: define {{[^@]+}}@_Z3foov
1896 // IR-SAME: () #[[ATTR0:[0-9]+]] {
1897 // IR-NEXT:  entry:
1898 // IR-NEXT:    [[I:%.*]] = alloca i32, align 4
1899 // IR-NEXT:    [[J:%.*]] = alloca i32, align 4
1900 // IR-NEXT:    [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
1901 // IR-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
1902 // IR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[J]], align 4
1903 // IR-NEXT:    store i32 [[TMP0]], ptr [[J_CASTED]], align 4
1904 // IR-NEXT:    [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
1905 // IR-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
1906 // IR-NEXT:    ret i32 0
1907 //
1908 //
1909 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
1910 // IR-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
1911 // IR-NEXT:  entry:
1912 // IR-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
1913 // IR-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
1914 // IR-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
1915 // IR-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
1916 // IR-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
1917 // IR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
1918 // IR-NEXT:    [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
1919 // IR-NEXT:    store i32 [[TMP1]], ptr [[J_CASTED]], align 4
1920 // IR-NEXT:    [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
1921 // IR-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
1922 // IR-NEXT:    ret void
1923 //
1924 //
1925 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
1926 // IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
1927 // IR-NEXT:  entry:
1928 // IR-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
1929 // IR-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
1930 // IR-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
1931 // IR-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
1932 // IR-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
1933 // IR-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
1934 // IR-NEXT:    [[TMP:%.*]] = alloca i32, align 4
1935 // IR-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
1936 // IR-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
1937 // IR-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
1938 // IR-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
1939 // IR-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
1940 // IR-NEXT:    [[J3:%.*]] = alloca i32, align 4
1941 // IR-NEXT:    [[I:%.*]] = alloca i32, align 4
1942 // IR-NEXT:    [[J4:%.*]] = alloca i32, align 4
1943 // IR-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
1944 // IR-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
1945 // IR-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
1946 // IR-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
1947 // IR-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
1948 // IR-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
1949 // IR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
1950 // IR-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
1951 // IR-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1952 // IR-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
1953 // IR-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1954 // IR:       omp.arrayinit.body:
1955 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1956 // IR-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1957 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1958 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
1959 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1960 // IR:       omp.arrayinit.done:
1961 // IR-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
1962 // IR-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
1963 // IR-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
1964 // IR-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
1965 // IR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
1966 // IR-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
1967 // IR-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
1968 // IR-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1969 // IR-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
1970 // IR-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1971 // IR:       cond.true:
1972 // IR-NEXT:    br label [[COND_END:%.*]]
1973 // IR:       cond.false:
1974 // IR-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1975 // IR-NEXT:    br label [[COND_END]]
1976 // IR:       cond.end:
1977 // IR-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
1978 // IR-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
1979 // IR-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
1980 // IR-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
1981 // IR-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
1982 // IR:       omp.inner.for.cond:
1983 // IR-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
1984 // IR-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1985 // IR-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
1986 // IR-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1987 // IR:       omp.inner.for.body:
1988 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
1989 // IR-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
1990 // IR-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1991 // IR-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
1992 // IR-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
1993 // IR-NEXT:    store i32 [[TMP13]], ptr [[J_CASTED]], align 4
1994 // IR-NEXT:    [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
1995 // IR-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
1996 // IR-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
1997 // IR:       omp.inner.for.inc:
1998 // IR-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
1999 // IR-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
2000 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
2001 // IR-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
2002 // IR-NEXT:    br label [[OMP_INNER_FOR_COND]]
2003 // IR:       omp.inner.for.end:
2004 // IR-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2005 // IR:       omp.loop.exit:
2006 // IR-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2007 // IR-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
2008 // IR-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
2009 // IR-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2010 // IR-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
2011 // IR-NEXT:    br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2012 // IR:       .omp.lastprivate.then:
2013 // IR-NEXT:    store i32 10, ptr [[J3]], align 4
2014 // IR-NEXT:    [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
2015 // IR-NEXT:    store i32 [[TMP21]], ptr [[J_ADDR]], align 4
2016 // IR-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2017 // IR:       .omp.lastprivate.done:
2018 // IR-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2019 // IR-NEXT:    store ptr [[SUM1]], ptr [[TMP22]], align 8
2020 // IR-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2021 // IR-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
2022 // IR-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2023 // IR-NEXT:    switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2024 // IR-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2025 // IR-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2026 // IR-NEXT:    ]
2027 // IR:       .omp.reduction.case1:
2028 // IR-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2029 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
2030 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2031 // IR:       omp.arraycpy.body:
2032 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2033 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2034 // IR-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2035 // IR-NEXT:    [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2036 // IR-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
2037 // IR-NEXT:    store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2038 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
2039 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2040 // IR-NEXT:    [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
2041 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
2042 // IR:       omp.arraycpy.done10:
2043 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2044 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2045 // IR:       .omp.reduction.case2:
2046 // IR-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2047 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
2048 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
2049 // IR:       omp.arraycpy.body12:
2050 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2051 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2052 // IR-NEXT:    [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
2053 // IR-NEXT:    [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
2054 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
2055 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
2056 // IR-NEXT:    [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
2057 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
2058 // IR:       omp.arraycpy.done18:
2059 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2060 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2061 // IR:       .omp.reduction.default:
2062 // IR-NEXT:    ret void
2063 //
2064 //
2065 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
2066 // IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2067 // IR-NEXT:  entry:
2068 // IR-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2069 // IR-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2070 // IR-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
2071 // IR-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
2072 // IR-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2073 // IR-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2074 // IR-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
2075 // IR-NEXT:    [[TMP:%.*]] = alloca i32, align 4
2076 // IR-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
2077 // IR-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
2078 // IR-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
2079 // IR-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2080 // IR-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2081 // IR-NEXT:    [[J3:%.*]] = alloca i32, align 4
2082 // IR-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
2083 // IR-NEXT:    [[I:%.*]] = alloca i32, align 4
2084 // IR-NEXT:    [[J5:%.*]] = alloca i32, align 4
2085 // IR-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2086 // IR-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2087 // IR-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2088 // IR-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2089 // IR-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2090 // IR-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2091 // IR-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2092 // IR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2093 // IR-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
2094 // IR-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
2095 // IR-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2096 // IR-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
2097 // IR-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2098 // IR-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
2099 // IR-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
2100 // IR-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
2101 // IR-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2102 // IR-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2103 // IR-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
2104 // IR-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2105 // IR-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
2106 // IR-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2107 // IR:       omp.arrayinit.body:
2108 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2109 // IR-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2110 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2111 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
2112 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2113 // IR:       omp.arrayinit.done:
2114 // IR-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2115 // IR-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
2116 // IR-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2117 // IR-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2118 // IR-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
2119 // IR-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2120 // IR:       cond.true:
2121 // IR-NEXT:    br label [[COND_END:%.*]]
2122 // IR:       cond.false:
2123 // IR-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2124 // IR-NEXT:    br label [[COND_END]]
2125 // IR:       cond.end:
2126 // IR-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
2127 // IR-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
2128 // IR-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
2129 // IR-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
2130 // IR-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
2131 // IR:       omp.inner.for.cond:
2132 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
2133 // IR-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
2134 // IR-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
2135 // IR-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2136 // IR:       omp.inner.for.body:
2137 // IR-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2138 // IR-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
2139 // IR-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
2140 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
2141 // IR-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2142 // IR-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2143 // IR-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2144 // IR-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
2145 // IR-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
2146 // IR-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
2147 // IR-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
2148 // IR-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
2149 // IR-NEXT:    store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2150 // IR-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2151 // IR-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2152 // IR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
2153 // IR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
2154 // IR-NEXT:    [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2155 // IR-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
2156 // IR-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
2157 // IR-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2158 // IR-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
2159 // IR-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2160 // IR-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
2161 // IR:       omp.body.continue:
2162 // IR-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
2163 // IR:       omp.inner.for.inc:
2164 // IR-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2165 // IR-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
2166 // IR-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2167 // IR-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
2168 // IR:       omp.inner.for.end:
2169 // IR-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2170 // IR:       omp.loop.exit:
2171 // IR-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2172 // IR-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
2173 // IR-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
2174 // IR-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2175 // IR-NEXT:    store ptr [[SUM4]], ptr [[TMP21]], align 8
2176 // IR-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2177 // IR-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
2178 // IR-NEXT:    [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2179 // IR-NEXT:    switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2180 // IR-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2181 // IR-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2182 // IR-NEXT:    ]
2183 // IR:       .omp.reduction.case1:
2184 // IR-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2185 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
2186 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2187 // IR:       omp.arraycpy.body:
2188 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2189 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2190 // IR-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2191 // IR-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2192 // IR-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
2193 // IR-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2194 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
2195 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2196 // IR-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
2197 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
2198 // IR:       omp.arraycpy.done19:
2199 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2200 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2201 // IR:       .omp.reduction.case2:
2202 // IR-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2203 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
2204 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
2205 // IR:       omp.arraycpy.body21:
2206 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2207 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2208 // IR-NEXT:    [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
2209 // IR-NEXT:    [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
2210 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
2211 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
2212 // IR-NEXT:    [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
2213 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
2214 // IR:       omp.arraycpy.done27:
2215 // IR-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2216 // IR-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2217 // IR:       .omp.reduction.default:
2218 // IR-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2219 // IR-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
2220 // IR-NEXT:    br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2221 // IR:       .omp.lastprivate.then:
2222 // IR-NEXT:    store i32 10, ptr [[J3]], align 4
2223 // IR-NEXT:    [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
2224 // IR-NEXT:    store i32 [[TMP33]], ptr [[J_ADDR]], align 4
2225 // IR-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2226 // IR:       .omp.lastprivate.done:
2227 // IR-NEXT:    ret void
2228 //
2229 //
2230 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
2231 // IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
2232 // IR-NEXT:  entry:
2233 // IR-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2234 // IR-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2235 // IR-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2236 // IR-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2237 // IR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2238 // IR-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2239 // IR-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2240 // IR-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2241 // IR-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2242 // IR-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2243 // IR-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2244 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2245 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2246 // IR:       omp.arraycpy.body:
2247 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2248 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2249 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2250 // IR-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2251 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2252 // IR-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2253 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2254 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2255 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2256 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2257 // IR:       omp.arraycpy.done2:
2258 // IR-NEXT:    ret void
2259 //
2260 //
2261 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
2262 // IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
2263 // IR-NEXT:  entry:
2264 // IR-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2265 // IR-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2266 // IR-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2267 // IR-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2268 // IR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2269 // IR-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2270 // IR-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2271 // IR-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2272 // IR-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2273 // IR-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2274 // IR-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2275 // IR-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2276 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2277 // IR:       omp.arraycpy.body:
2278 // IR-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2279 // IR-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2280 // IR-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2281 // IR-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2282 // IR-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2283 // IR-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2284 // IR-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2285 // IR-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2286 // IR-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2287 // IR-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2288 // IR:       omp.arraycpy.done2:
2289 // IR-NEXT:    ret void
2290 //
2291 //
2292 // IR-PCH-LABEL: define {{[^@]+}}@_Z3foov
2293 // IR-PCH-SAME: () #[[ATTR0:[0-9]+]] {
2294 // IR-PCH-NEXT:  entry:
2295 // IR-PCH-NEXT:    [[I:%.*]] = alloca i32, align 4
2296 // IR-PCH-NEXT:    [[J:%.*]] = alloca i32, align 4
2297 // IR-PCH-NEXT:    [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
2298 // IR-PCH-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
2299 // IR-PCH-NEXT:    [[TMP0:%.*]] = load i32, ptr [[J]], align 4
2300 // IR-PCH-NEXT:    store i32 [[TMP0]], ptr [[J_CASTED]], align 4
2301 // IR-PCH-NEXT:    [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
2302 // IR-PCH-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
2303 // IR-PCH-NEXT:    ret i32 0
2304 //
2305 //
2306 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
2307 // IR-PCH-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
2308 // IR-PCH-NEXT:  entry:
2309 // IR-PCH-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2310 // IR-PCH-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2311 // IR-PCH-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
2312 // IR-PCH-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2313 // IR-PCH-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2314 // IR-PCH-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2315 // IR-PCH-NEXT:    [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
2316 // IR-PCH-NEXT:    store i32 [[TMP1]], ptr [[J_CASTED]], align 4
2317 // IR-PCH-NEXT:    [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
2318 // IR-PCH-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
2319 // IR-PCH-NEXT:    ret void
2320 //
2321 //
2322 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
2323 // IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2324 // IR-PCH-NEXT:  entry:
2325 // IR-PCH-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2326 // IR-PCH-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2327 // IR-PCH-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2328 // IR-PCH-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2329 // IR-PCH-NEXT:    [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
2330 // IR-PCH-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
2331 // IR-PCH-NEXT:    [[TMP:%.*]] = alloca i32, align 4
2332 // IR-PCH-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
2333 // IR-PCH-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
2334 // IR-PCH-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
2335 // IR-PCH-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2336 // IR-PCH-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2337 // IR-PCH-NEXT:    [[J3:%.*]] = alloca i32, align 4
2338 // IR-PCH-NEXT:    [[I:%.*]] = alloca i32, align 4
2339 // IR-PCH-NEXT:    [[J4:%.*]] = alloca i32, align 4
2340 // IR-PCH-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8
2341 // IR-PCH-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2342 // IR-PCH-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2343 // IR-PCH-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2344 // IR-PCH-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2345 // IR-PCH-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2346 // IR-PCH-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2347 // IR-PCH-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
2348 // IR-PCH-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2349 // IR-PCH-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
2350 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2351 // IR-PCH:       omp.arrayinit.body:
2352 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2353 // IR-PCH-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2354 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2355 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
2356 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2357 // IR-PCH:       omp.arrayinit.done:
2358 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
2359 // IR-PCH-NEXT:    store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
2360 // IR-PCH-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2361 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2362 // IR-PCH-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2363 // IR-PCH-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
2364 // IR-PCH-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2365 // IR-PCH-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2366 // IR-PCH-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
2367 // IR-PCH-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2368 // IR-PCH:       cond.true:
2369 // IR-PCH-NEXT:    br label [[COND_END:%.*]]
2370 // IR-PCH:       cond.false:
2371 // IR-PCH-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2372 // IR-PCH-NEXT:    br label [[COND_END]]
2373 // IR-PCH:       cond.end:
2374 // IR-PCH-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
2375 // IR-PCH-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
2376 // IR-PCH-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
2377 // IR-PCH-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
2378 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
2379 // IR-PCH:       omp.inner.for.cond:
2380 // IR-PCH-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
2381 // IR-PCH-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2382 // IR-PCH-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
2383 // IR-PCH-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2384 // IR-PCH:       omp.inner.for.body:
2385 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
2386 // IR-PCH-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
2387 // IR-PCH-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2388 // IR-PCH-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
2389 // IR-PCH-NEXT:    [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
2390 // IR-PCH-NEXT:    store i32 [[TMP13]], ptr [[J_CASTED]], align 4
2391 // IR-PCH-NEXT:    [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
2392 // IR-PCH-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
2393 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
2394 // IR-PCH:       omp.inner.for.inc:
2395 // IR-PCH-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
2396 // IR-PCH-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
2397 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
2398 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
2399 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND]]
2400 // IR-PCH:       omp.inner.for.end:
2401 // IR-PCH-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2402 // IR-PCH:       omp.loop.exit:
2403 // IR-PCH-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2404 // IR-PCH-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
2405 // IR-PCH-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
2406 // IR-PCH-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2407 // IR-PCH-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
2408 // IR-PCH-NEXT:    br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2409 // IR-PCH:       .omp.lastprivate.then:
2410 // IR-PCH-NEXT:    store i32 10, ptr [[J3]], align 4
2411 // IR-PCH-NEXT:    [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
2412 // IR-PCH-NEXT:    store i32 [[TMP21]], ptr [[J_ADDR]], align 4
2413 // IR-PCH-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2414 // IR-PCH:       .omp.lastprivate.done:
2415 // IR-PCH-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2416 // IR-PCH-NEXT:    store ptr [[SUM1]], ptr [[TMP22]], align 8
2417 // IR-PCH-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2418 // IR-PCH-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
2419 // IR-PCH-NEXT:    [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2420 // IR-PCH-NEXT:    switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2421 // IR-PCH-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2422 // IR-PCH-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2423 // IR-PCH-NEXT:    ]
2424 // IR-PCH:       .omp.reduction.case1:
2425 // IR-PCH-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2426 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
2427 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2428 // IR-PCH:       omp.arraycpy.body:
2429 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2430 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2431 // IR-PCH-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2432 // IR-PCH-NEXT:    [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2433 // IR-PCH-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
2434 // IR-PCH-NEXT:    store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2435 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
2436 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2437 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
2438 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
2439 // IR-PCH:       omp.arraycpy.done10:
2440 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2441 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2442 // IR-PCH:       .omp.reduction.case2:
2443 // IR-PCH-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2444 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
2445 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
2446 // IR-PCH:       omp.arraycpy.body12:
2447 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2448 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2449 // IR-PCH-NEXT:    [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
2450 // IR-PCH-NEXT:    [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
2451 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
2452 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
2453 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
2454 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
2455 // IR-PCH:       omp.arraycpy.done18:
2456 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2457 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2458 // IR-PCH:       .omp.reduction.default:
2459 // IR-PCH-NEXT:    ret void
2460 //
2461 //
2462 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
2463 // IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2464 // IR-PCH-NEXT:  entry:
2465 // IR-PCH-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2466 // IR-PCH-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2467 // IR-PCH-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
2468 // IR-PCH-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
2469 // IR-PCH-NEXT:    [[J_ADDR:%.*]] = alloca i64, align 8
2470 // IR-PCH-NEXT:    [[SUM_ADDR:%.*]] = alloca ptr, align 8
2471 // IR-PCH-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
2472 // IR-PCH-NEXT:    [[TMP:%.*]] = alloca i32, align 4
2473 // IR-PCH-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
2474 // IR-PCH-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
2475 // IR-PCH-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
2476 // IR-PCH-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2477 // IR-PCH-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2478 // IR-PCH-NEXT:    [[J3:%.*]] = alloca i32, align 4
2479 // IR-PCH-NEXT:    [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
2480 // IR-PCH-NEXT:    [[I:%.*]] = alloca i32, align 4
2481 // IR-PCH-NEXT:    [[J5:%.*]] = alloca i32, align 4
2482 // IR-PCH-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2483 // IR-PCH-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2484 // IR-PCH-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2485 // IR-PCH-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2486 // IR-PCH-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2487 // IR-PCH-NEXT:    store i64 [[J]], ptr [[J_ADDR]], align 8
2488 // IR-PCH-NEXT:    store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2489 // IR-PCH-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2490 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
2491 // IR-PCH-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
2492 // IR-PCH-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2493 // IR-PCH-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
2494 // IR-PCH-NEXT:    [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2495 // IR-PCH-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
2496 // IR-PCH-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
2497 // IR-PCH-NEXT:    store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
2498 // IR-PCH-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2499 // IR-PCH-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2500 // IR-PCH-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
2501 // IR-PCH-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2502 // IR-PCH-NEXT:    [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
2503 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2504 // IR-PCH:       omp.arrayinit.body:
2505 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2506 // IR-PCH-NEXT:    store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2507 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2508 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
2509 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2510 // IR-PCH:       omp.arrayinit.done:
2511 // IR-PCH-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2512 // IR-PCH-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
2513 // IR-PCH-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2514 // IR-PCH-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2515 // IR-PCH-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
2516 // IR-PCH-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2517 // IR-PCH:       cond.true:
2518 // IR-PCH-NEXT:    br label [[COND_END:%.*]]
2519 // IR-PCH:       cond.false:
2520 // IR-PCH-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2521 // IR-PCH-NEXT:    br label [[COND_END]]
2522 // IR-PCH:       cond.end:
2523 // IR-PCH-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
2524 // IR-PCH-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
2525 // IR-PCH-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
2526 // IR-PCH-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
2527 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
2528 // IR-PCH:       omp.inner.for.cond:
2529 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
2530 // IR-PCH-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
2531 // IR-PCH-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
2532 // IR-PCH-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2533 // IR-PCH:       omp.inner.for.body:
2534 // IR-PCH-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2535 // IR-PCH-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
2536 // IR-PCH-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
2537 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
2538 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2539 // IR-PCH-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2540 // IR-PCH-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2541 // IR-PCH-NEXT:    [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
2542 // IR-PCH-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
2543 // IR-PCH-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
2544 // IR-PCH-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
2545 // IR-PCH-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
2546 // IR-PCH-NEXT:    store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2547 // IR-PCH-NEXT:    [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2548 // IR-PCH-NEXT:    [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2549 // IR-PCH-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
2550 // IR-PCH-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
2551 // IR-PCH-NEXT:    [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2552 // IR-PCH-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
2553 // IR-PCH-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
2554 // IR-PCH-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2555 // IR-PCH-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
2556 // IR-PCH-NEXT:    store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2557 // IR-PCH-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
2558 // IR-PCH:       omp.body.continue:
2559 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
2560 // IR-PCH:       omp.inner.for.inc:
2561 // IR-PCH-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2562 // IR-PCH-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
2563 // IR-PCH-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2564 // IR-PCH-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
2565 // IR-PCH:       omp.inner.for.end:
2566 // IR-PCH-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
2567 // IR-PCH:       omp.loop.exit:
2568 // IR-PCH-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2569 // IR-PCH-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
2570 // IR-PCH-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
2571 // IR-PCH-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2572 // IR-PCH-NEXT:    store ptr [[SUM4]], ptr [[TMP21]], align 8
2573 // IR-PCH-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2574 // IR-PCH-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
2575 // IR-PCH-NEXT:    [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2576 // IR-PCH-NEXT:    switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2577 // IR-PCH-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2578 // IR-PCH-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2579 // IR-PCH-NEXT:    ]
2580 // IR-PCH:       .omp.reduction.case1:
2581 // IR-PCH-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2582 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
2583 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2584 // IR-PCH:       omp.arraycpy.body:
2585 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2586 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2587 // IR-PCH-NEXT:    [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2588 // IR-PCH-NEXT:    [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2589 // IR-PCH-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
2590 // IR-PCH-NEXT:    store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2591 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
2592 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2593 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
2594 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
2595 // IR-PCH:       omp.arraycpy.done19:
2596 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2597 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2598 // IR-PCH:       .omp.reduction.case2:
2599 // IR-PCH-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2600 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
2601 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
2602 // IR-PCH:       omp.arraycpy.body21:
2603 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2604 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2605 // IR-PCH-NEXT:    [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
2606 // IR-PCH-NEXT:    [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
2607 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
2608 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
2609 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
2610 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
2611 // IR-PCH:       omp.arraycpy.done27:
2612 // IR-PCH-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2613 // IR-PCH-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
2614 // IR-PCH:       .omp.reduction.default:
2615 // IR-PCH-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2616 // IR-PCH-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
2617 // IR-PCH-NEXT:    br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2618 // IR-PCH:       .omp.lastprivate.then:
2619 // IR-PCH-NEXT:    store i32 10, ptr [[J3]], align 4
2620 // IR-PCH-NEXT:    [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
2621 // IR-PCH-NEXT:    store i32 [[TMP33]], ptr [[J_ADDR]], align 4
2622 // IR-PCH-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
2623 // IR-PCH:       .omp.lastprivate.done:
2624 // IR-PCH-NEXT:    ret void
2625 //
2626 //
2627 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
2628 // IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
2629 // IR-PCH-NEXT:  entry:
2630 // IR-PCH-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2631 // IR-PCH-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2632 // IR-PCH-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2633 // IR-PCH-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2634 // IR-PCH-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2635 // IR-PCH-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2636 // IR-PCH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2637 // IR-PCH-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2638 // IR-PCH-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2639 // IR-PCH-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2640 // IR-PCH-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2641 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2642 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2643 // IR-PCH:       omp.arraycpy.body:
2644 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2645 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2646 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2647 // IR-PCH-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2648 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2649 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2650 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2651 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2652 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2653 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2654 // IR-PCH:       omp.arraycpy.done2:
2655 // IR-PCH-NEXT:    ret void
2656 //
2657 //
2658 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
2659 // IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
2660 // IR-PCH-NEXT:  entry:
2661 // IR-PCH-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
2662 // IR-PCH-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
2663 // IR-PCH-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2664 // IR-PCH-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2665 // IR-PCH-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2666 // IR-PCH-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2667 // IR-PCH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2668 // IR-PCH-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2669 // IR-PCH-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2670 // IR-PCH-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2671 // IR-PCH-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2672 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2673 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2674 // IR-PCH:       omp.arraycpy.body:
2675 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2676 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2677 // IR-PCH-NEXT:    [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2678 // IR-PCH-NEXT:    [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2679 // IR-PCH-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2680 // IR-PCH-NEXT:    store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2681 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2682 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2683 // IR-PCH-NEXT:    [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2684 // IR-PCH-NEXT:    br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2685 // IR-PCH:       omp.arraycpy.done2:
2686 // IR-PCH-NEXT:    ret void
2687 //
2688