xref: /llvm-project/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir (revision 0e944a30954e666cba2bf17497fafe835e4b3519)
1// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s
2
3// 2-d parallel loop mapped to block.y and block.x
4
5func.func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
6                              %arg3 : index, %arg4 : index,
7                              %buf : memref<?x?xf32>,
8                              %res : memref<?x?xf32>) {
9  %step = arith.constant 2 : index
  // %i0 -> block_y, %i1 -> block_x, both with identity map/bound; the loop
  // bounds and the first step are runtime values, so the pass must emit
  // affine.apply ops to compute grid sizes and per-block indices.
10  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
11                                          step (%arg4, %step)  {
    // Body transposes %buf into %res.
12    %val = memref.load %buf[%i0, %i1] : memref<?x?xf32>
13    memref.store %val, %res[%i1, %i0] : memref<?x?xf32>
14  } { mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>] }
15  return
16}
17
18// CHECK:       #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
19// CHECK:       #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
20
21// CHECK:       module {
22// CHECK-LABEL:   func @parallel_loop_bidy_bidx(
23// CHECK-SAME:                                  [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref<?x?xf32>, [[VAL_6:%.*]]: memref<?x?xf32>) {
24// CHECK:           [[VAL_7:%.*]] = arith.constant 2 : index
25// CHECK:           [[VAL_8:%.*]] = arith.constant 1 : index
26// CHECK:           [[VAL_9:%.*]] = affine.apply #[[$MAP0]]([[VAL_2]]){{\[}}[[VAL_0]], [[VAL_4]]]
27// CHECK:           [[VAL_10:%.*]] = affine.apply #[[$MAP0]]([[VAL_3]]){{\[}}[[VAL_1]], [[VAL_7]]]
28// CHECK:           gpu.launch blocks([[VAL_11:%.*]], [[VAL_12:%.*]], [[VAL_13:%.*]]) in ([[VAL_14:%.*]] = [[VAL_10]], [[VAL_15:%.*]] = [[VAL_9]], [[VAL_16:%.*]] = [[VAL_8]]) threads([[VAL_17:%.*]], [[VAL_18:%.*]], [[VAL_19:%.*]]) in ([[VAL_20:%.*]] = [[VAL_8]], [[VAL_21:%.*]] = [[VAL_8]], [[VAL_22:%.*]] = [[VAL_8]]) {
29// CHECK:             [[VAL_23:%.*]] = affine.apply #[[$MAP1]]([[VAL_12]]){{\[}}[[VAL_4]], [[VAL_0]]]
30// CHECK:             [[VAL_24:%.*]] = affine.apply #[[$MAP1]]([[VAL_11]]){{\[}}[[VAL_7]], [[VAL_1]]]
31// CHECK:             [[VAL_25:%.*]] = memref.load [[VAL_5]]{{\[}}[[VAL_23]], [[VAL_24]]] : memref<?x?xf32>
32// CHECK:             memref.store [[VAL_25]], [[VAL_6]]{{\[}}[[VAL_24]], [[VAL_23]]] : memref<?x?xf32>
33// CHECK:             gpu.terminator
34// CHECK:           }
35// CHECK:           return
36// CHECK:         }
37// CHECK:       }
38
39// -----
40
41// tiled 2-d parallel loop mapped to block.y and block.x and thread.y and thread.x.
42
43func.func @parallel_loop_tiled(%arg0 : index, %arg1 : index, %arg2 : index,
44                        %arg3 : index,
45                        %buf : memref<?x?xf32>,
46                        %res : memref<?x?xf32>) {
47  %zero = arith.constant 0 : index
48  %one = arith.constant 1 : index
49  %four = arith.constant 4 : index
  // Outer loop: 4x4 tiles mapped to block_y/block_x.
50  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
51                                          step (%four, %four)  {
    // Inner loop: elements within a tile mapped to thread_y/thread_x; the
    // constant 0..4 bounds let both loops fuse into a single gpu.launch.
52    scf.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
53                                            step (%one, %one)  {
54      %idx0 = arith.addi %i0, %si0 : index
55      %idx1 = arith.addi %i1, %si1 : index
56      %val = memref.load %buf[%idx0, %idx1] : memref<?x?xf32>
57      memref.store %val, %res[%idx1, %idx0] : memref<?x?xf32>
58    } { mapping = [
59        #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
60        #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
61     ] }
62  } { mapping = [
63      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
64      #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
65    ] }
66  return
67}
68
69// CHECK:       #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
70// CHECK:       #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
71
72// CHECK:       module {
73// CHECK-LABEL:   func @parallel_loop_tiled(
74// CHECK-SAME:                              [[VAL_26:%.*]]: index, [[VAL_27:%.*]]: index, [[VAL_28:%.*]]: index, [[VAL_29:%.*]]: index, [[VAL_30:%.*]]: memref<?x?xf32>, [[VAL_31:%.*]]: memref<?x?xf32>) {
75// CHECK:           [[VAL_32:%.*]] = arith.constant 0 : index
76// CHECK:           [[VAL_33:%.*]] = arith.constant 1 : index
77// CHECK:           [[VAL_34:%.*]] = arith.constant 4 : index
78// CHECK:           [[VAL_35:%.*]] = arith.constant 1 : index
79// CHECK:           [[VAL_36:%.*]] = affine.apply #[[$MAP0]]([[VAL_28]]){{\[}}[[VAL_26]], [[VAL_34]]]
80// CHECK:           [[VAL_37:%.*]] = affine.apply #[[$MAP0]]([[VAL_29]]){{\[}}[[VAL_27]], [[VAL_34]]]
81// CHECK:           [[VAL_38:%.*]] = affine.apply #[[$MAP0]]([[VAL_34]]){{\[}}[[VAL_32]], [[VAL_33]]]
82// CHECK:           [[VAL_39:%.*]] = affine.apply #[[$MAP0]]([[VAL_34]]){{\[}}[[VAL_32]], [[VAL_33]]]
83// CHECK:           gpu.launch blocks([[VAL_40:%.*]], [[VAL_41:%.*]], [[VAL_42:%.*]]) in ([[VAL_43:%.*]] = [[VAL_37]], [[VAL_44:%.*]] = [[VAL_36]], [[VAL_45:%.*]] = [[VAL_35]]) threads([[VAL_46:%.*]], [[VAL_47:%.*]], [[VAL_48:%.*]]) in ([[VAL_49:%.*]] = [[VAL_39]], [[VAL_50:%.*]] = [[VAL_38]], [[VAL_51:%.*]] = [[VAL_35]]) {
84// CHECK:             [[VAL_52:%.*]] = affine.apply #[[$MAP1]]([[VAL_41]]){{\[}}[[VAL_34]], [[VAL_26]]]
85// CHECK:             [[VAL_53:%.*]] = affine.apply #[[$MAP1]]([[VAL_40]]){{\[}}[[VAL_34]], [[VAL_27]]]
86// CHECK:             [[VAL_54:%.*]] = affine.apply #[[$MAP1]]([[VAL_47]]){{\[}}[[VAL_33]], [[VAL_32]]]
87// CHECK:             [[VAL_55:%.*]] = affine.apply #[[$MAP1]]([[VAL_46]]){{\[}}[[VAL_33]], [[VAL_32]]]
88// CHECK:             [[VAL_56:%.*]] = arith.addi [[VAL_52]], [[VAL_54]] : index
89// CHECK:             [[VAL_57:%.*]] = arith.addi [[VAL_53]], [[VAL_55]] : index
90// CHECK:             [[VAL_58:%.*]] = memref.load [[VAL_30]]{{\[}}[[VAL_56]], [[VAL_57]]] : memref<?x?xf32>
91// CHECK:             memref.store [[VAL_58]], [[VAL_31]]{{\[}}[[VAL_57]], [[VAL_56]]] : memref<?x?xf32>
92// CHECK:             gpu.terminator
93// CHECK:           }
94// CHECK:           return
95// CHECK:         }
96// CHECK:       }
97
98// -----
99
100// 2-d parallel loop mapped to block.y and sequential
101
102func.func @parallel_loop_bidy_seq(%arg0 : index, %arg1 : index, %arg2 : index,
103                             %arg3 : index, %arg4 : index,
104                             %buf : memref<?x?xf32>,
105                             %res : memref<?x?xf32>) {
106  %step = arith.constant 2 : index
  // %i0 -> block_y; %i1 is mapped to `sequential`, so it must be lowered to a
  // plain scf.for inside the gpu.launch body rather than a hardware dimension.
107  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
108                                          step (%arg4, %step)  {
109    %val = memref.load %buf[%i0, %i1] : memref<?x?xf32>
110    memref.store %val, %res[%i1, %i0] : memref<?x?xf32>
111  } { mapping = [
112      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
113      #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
114    ] }
115  return
116}
117
118// CHECK:       #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
119// CHECK:       #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
120
121// CHECK:       module {
122// CHECK-LABEL:   func @parallel_loop_bidy_seq(
123// CHECK-SAME:                                 [[VAL_59:%.*]]: index, [[VAL_60:%.*]]: index, [[VAL_61:%.*]]: index, [[VAL_62:%.*]]: index, [[VAL_63:%.*]]: index, [[VAL_64:%.*]]: memref<?x?xf32>, [[VAL_65:%.*]]: memref<?x?xf32>) {
124// CHECK:           [[VAL_66:%.*]] = arith.constant 2 : index
125// CHECK:           [[VAL_67:%.*]] = arith.constant 1 : index
126// CHECK:           [[VAL_68:%.*]] = affine.apply #[[$MAP0]]([[VAL_61]]){{\[}}[[VAL_59]], [[VAL_63]]]
127// CHECK:           gpu.launch blocks([[VAL_69:%.*]], [[VAL_70:%.*]], [[VAL_71:%.*]]) in ([[VAL_72:%.*]] = [[VAL_67]], [[VAL_73:%.*]] = [[VAL_68]], [[VAL_74:%.*]] = [[VAL_67]]) threads([[VAL_75:%.*]], [[VAL_76:%.*]], [[VAL_77:%.*]]) in ([[VAL_78:%.*]] = [[VAL_67]], [[VAL_79:%.*]] = [[VAL_67]], [[VAL_80:%.*]] = [[VAL_67]]) {
128// CHECK:             [[VAL_81:%.*]] = affine.apply #[[$MAP1]]([[VAL_70]]){{\[}}[[VAL_63]], [[VAL_59]]]
129// CHECK:             scf.for [[VAL_82:%.*]] = [[VAL_60]] to [[VAL_62]] step [[VAL_66]] {
130// CHECK:               [[VAL_83:%.*]] = memref.load [[VAL_64]]{{\[}}[[VAL_81]], [[VAL_82]]] : memref<?x?xf32>
131// CHECK:               memref.store [[VAL_83]], [[VAL_65]]{{\[}}[[VAL_82]], [[VAL_81]]] : memref<?x?xf32>
132// CHECK:             }
133// CHECK:             gpu.terminator
134// CHECK:           }
135// CHECK:           return
136// CHECK:         }
137// CHECK:       }
138
139// -----
140
141// tiled 2-d parallel loop mapped to block.y and seq. and thread.y and seq.
142
143func.func @parallel_loop_tiled_seq(%arg0 : index, %arg1 : index, %arg2 : index,
144                              %arg3 : index,
145                              %buf : memref<?x?xf32>,
146                              %res : memref<?x?xf32>) {
147  %zero = arith.constant 0 : index
148  %one = arith.constant 1 : index
149  %four = arith.constant 4 : index
  // Outer loop: first dim -> block_y, second dim -> sequential (scf.for).
150  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
151                                          step (%four, %four)  {
    // Inner loop: first dim -> thread_y, second dim -> sequential, yielding
    // two nested scf.for loops inside the single gpu.launch.
152    scf.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
153                                            step (%one, %one)  {
154      %idx0 = arith.addi %i0, %si0 : index
155      %idx1 = arith.addi %i1, %si1 : index
156      %val = memref.load %buf[%idx0, %idx1] : memref<?x?xf32>
157      memref.store %val, %res[%idx1, %idx0] : memref<?x?xf32>
158    } { mapping = [
159        #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
160        #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
161      ] }
162  } { mapping = [
163      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
164      #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
165    ] }
166  return
167}
168
169// CHECK:       #[[$MAP0:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
170// CHECK:       #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
171
172// CHECK:       module {
173// CHECK-LABEL:   func @parallel_loop_tiled_seq(
174// CHECK-SAME:                                  [[VAL_84:%.*]]: index, [[VAL_85:%.*]]: index, [[VAL_86:%.*]]: index, [[VAL_87:%.*]]: index, [[VAL_88:%.*]]: memref<?x?xf32>, [[VAL_89:%.*]]: memref<?x?xf32>) {
175// CHECK:           [[VAL_90:%.*]] = arith.constant 0 : index
176// CHECK:           [[VAL_91:%.*]] = arith.constant 1 : index
177// CHECK:           [[VAL_92:%.*]] = arith.constant 4 : index
178// CHECK:           [[VAL_93:%.*]] = arith.constant 1 : index
179// CHECK:           [[VAL_94:%.*]] = affine.apply #[[$MAP0]]([[VAL_86]]){{\[}}[[VAL_84]], [[VAL_92]]]
180// CHECK:           [[VAL_95:%.*]] = affine.apply #[[$MAP0]]([[VAL_92]]){{\[}}[[VAL_90]], [[VAL_91]]]
181// CHECK:           gpu.launch blocks([[VAL_96:%.*]], [[VAL_97:%.*]], [[VAL_98:%.*]]) in ([[VAL_99:%.*]] = [[VAL_93]], [[VAL_100:%.*]] = [[VAL_94]], [[VAL_101:%.*]] = [[VAL_93]]) threads([[VAL_102:%.*]], [[VAL_103:%.*]], [[VAL_104:%.*]]) in ([[VAL_105:%.*]] = [[VAL_93]], [[VAL_106:%.*]] = [[VAL_95]], [[VAL_107:%.*]] = [[VAL_93]]) {
182// CHECK:             [[VAL_108:%.*]] = affine.apply #[[$MAP1]]([[VAL_97]]){{\[}}[[VAL_92]], [[VAL_84]]]
183// CHECK:             scf.for [[VAL_109:%.*]] = [[VAL_85]] to [[VAL_87]] step [[VAL_92]] {
184// CHECK:               [[VAL_110:%.*]] = affine.apply #[[$MAP1]]([[VAL_103]]){{\[}}[[VAL_91]], [[VAL_90]]]
185// CHECK:               scf.for [[VAL_111:%.*]] = [[VAL_90]] to [[VAL_92]] step [[VAL_91]] {
186// CHECK:                 [[VAL_112:%.*]] = arith.addi [[VAL_108]], [[VAL_110]] : index
187// CHECK:                 [[VAL_113:%.*]] = arith.addi [[VAL_109]], [[VAL_111]] : index
188// CHECK:                 [[VAL_114:%.*]] = memref.load [[VAL_88]]{{\[}}[[VAL_112]], [[VAL_113]]] : memref<?x?xf32>
189// CHECK:                 memref.store [[VAL_114]], [[VAL_89]]{{\[}}[[VAL_113]], [[VAL_112]]] : memref<?x?xf32>
190// CHECK:               }
191// CHECK:             }
192// CHECK:             gpu.terminator
193// CHECK:           }
194// CHECK:           return
195// CHECK:         }
196// CHECK:       }
197
198// -----
199
// Min maps used to clamp the 2x3 tile sizes at the ragged edges of the
// dynamically-shaped inputs.
200#map1 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
201#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
202
203module {
  // Tiled elementwise sum over imperfectly-tiled dynamic memrefs: outer loop
  // -> block_x/block_y, inner loop -> thread_x/thread_y. Because the inner
  // trip counts (%squared_min, %5) vary per tile, the lowering must guard the
  // body with scf.if bounds checks.
204  func.func @sum(%arg0: memref<?x?xf32, strided<[?, 1], offset: ?>>, %arg1: memref<?x?xf32, strided<[?, 1], offset: ?>>, %arg2: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
205    %c1 = arith.constant 1 : index
206    %c0 = arith.constant 0 : index
207    %c3 = arith.constant 3 : index
208    %c2 = arith.constant 2 : index
209    %0 = memref.dim %arg0, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
210    %1 = memref.dim %arg0, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
211    scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c2, %c3) {
212      %2 = memref.dim %arg0, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
213      %3 = affine.min #map1(%arg3)[%2]
214      %squared_min = arith.muli %3, %3 : index
215      %4 = memref.dim %arg0, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
216      %d = arith.subi %4, %arg4 : index
217      %5 = arith.minsi %c3, %d : index
218      %6 = memref.subview %arg0[%arg3, %arg4][%squared_min, %5][%c1, %c1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
219      %7 = memref.dim %arg1, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
220      %8 = affine.min #map1(%arg3)[%7]
221      %9 = memref.dim %arg1, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
222      %10 = affine.min #map2(%arg4)[%9]
223      %11 = memref.subview %arg1[%arg3, %arg4][%8, %10][%c1, %c1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
224      %12 = memref.dim %arg2, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
225      %13 = affine.min #map1(%arg3)[%12]
226      %14 = memref.dim %arg2, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
227      %15 = affine.min #map2(%arg4)[%14]
228      %16 = memref.subview %arg2[%arg3, %arg4][%13, %15][%c1, %c1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
229      scf.parallel (%arg5, %arg6) = (%c0, %c0) to (%squared_min, %5) step (%c1, %c1) {
230        %17 = memref.load %6[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
231        %18 = memref.load %11[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
232        %19 = memref.load %16[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
233        %20 = arith.addf %17, %18 : f32
234        memref.store %20, %16[%arg5, %arg6] : memref<?x?xf32, strided<[?, ?], offset: ?>>
235        scf.reduce
236      } {mapping = [#gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = thread_x>, #gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = thread_y>]}
237      scf.reduce
238    } {mapping = [#gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = block_x>, #gpu.loop_dim_map<bound = (d0) -> (d0), map = (d0) -> (d0), processor = block_y>]}
239    return
240  }
241}
242
243// CHECK-DAG:       #[[$MAP1:.*]] = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
244// CHECK-DAG:       #[[$MAP2:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
245// CHECK-DAG:       #[[$MAP3:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)>
246// CHECK-DAG:       #[[$MAP4:.*]] = affine_map<(d0)[s0] -> (3, -d0 + s0)>
247
248// CHECK:       module {
249// CHECK-LABEL:   func @sum(
250// CHECK-SAME:              [[VAL_0:%.*]]: memref<?x?xf32, strided<[?, 1], offset: ?>>, [[VAL_1:%.*]]: memref<?x?xf32, strided<[?, 1], offset: ?>>, [[VAL_2:%.*]]: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
251// CHECK:           %[[C1:.*]] = arith.constant 1 : index
252// CHECK:           %[[C0:.*]] = arith.constant 0 : index
253// CHECK:           %[[C3:.*]] = arith.constant 3 : index
254// CHECK:           %[[C2:.*]] = arith.constant 2 : index
255// CHECK:           [[VAL_7:%.*]] = memref.dim [[VAL_0]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
256// CHECK:           [[VAL_8:%.*]] = memref.dim [[VAL_0]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
257// CHECK:           [[VAL_9:%.*]] = arith.constant 1 : index
258// CHECK:           [[VAL_10:%.*]] = affine.apply #[[$MAP1]]([[VAL_7]]){{\[}}%[[C0]], %[[C2]]]
259// CHECK:           [[VAL_11:%.*]] = affine.apply #[[$MAP1]]([[VAL_8]]){{\[}}%[[C0]], %[[C3]]]
260// CHECK:           [[VAL_12:%.*]] = arith.constant 4 : index
261// CHECK:           [[VAL_13:%.*]] = affine.apply #[[$MAP1]]([[VAL_12]]){{\[}}%[[C0]], %[[C1]]]
262// CHECK:           [[VAL_15:%.*]] = affine.apply #[[$MAP1]](%[[C3]]){{\[}}%[[C0]], %[[C1]]]
263// CHECK:           gpu.launch blocks([[VAL_16:%.*]], [[VAL_17:%.*]], [[VAL_18:%.*]]) in ([[VAL_19:%.*]] = [[VAL_10]], [[VAL_20:%.*]] = [[VAL_11]], [[VAL_21:%.*]] = [[VAL_9]]) threads([[VAL_22:%.*]], [[VAL_23:%.*]], [[VAL_24:%.*]]) in ([[VAL_25:%.*]] = [[VAL_13]], [[VAL_26:%.*]] = [[VAL_15]], [[VAL_27:%.*]] = [[VAL_9]]) {
264// CHECK:             [[VAL_28:%.*]] = affine.apply #[[$MAP2]]([[VAL_16]]){{\[}}%[[C2]], %[[C0]]]
265// CHECK:             [[VAL_29:%.*]] = affine.apply #[[$MAP2]]([[VAL_17]]){{\[}}%[[C3]], %[[C0]]]
266// CHECK:             [[VAL_30:%.*]] = memref.dim [[VAL_0]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
267// CHECK:             [[VAL_31:%.*]] = affine.min #[[$MAP3]]([[VAL_28]]){{\[}}[[VAL_30]]]
268// CHECK:             [[VAL_31_SQUARED:%.*]] = arith.muli [[VAL_31]], [[VAL_31]] : index
269// CHECK:             [[VAL_32:%.*]] = memref.dim [[VAL_0]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
270// CHECK:             [[VAL_D:%.*]] = arith.subi [[VAL_32]], [[VAL_29]] : index
271// CHECK:             [[VAL_33:%.*]] = arith.minsi %[[C3]], [[VAL_D]] : index
272// CHECK:             [[VAL_34:%.*]] = memref.subview [[VAL_0]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_31_SQUARED]], [[VAL_33]]] {{\[}}%[[C1]], %[[C1]]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
273// CHECK:             [[VAL_35:%.*]] = memref.dim [[VAL_1]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
274// CHECK:             [[VAL_36:%.*]] = affine.min #[[$MAP3]]([[VAL_28]]){{\[}}[[VAL_35]]]
275// CHECK:             [[VAL_37:%.*]] = memref.dim [[VAL_1]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
276// CHECK:             [[VAL_38:%.*]] = affine.min #[[$MAP4]]([[VAL_29]]){{\[}}[[VAL_37]]]
277// CHECK:             [[VAL_39:%.*]] = memref.subview [[VAL_1]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_36]], [[VAL_38]]] {{\[}}%[[C1]], %[[C1]]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
278// CHECK:             [[VAL_40:%.*]] = memref.dim [[VAL_2]], %[[C0]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
279// CHECK:             [[VAL_41:%.*]] = affine.min #[[$MAP3]]([[VAL_28]]){{\[}}[[VAL_40]]]
280// CHECK:             [[VAL_42:%.*]] = memref.dim [[VAL_2]], %[[C1]] : memref<?x?xf32, strided<[?, 1], offset: ?>>
281// CHECK:             [[VAL_43:%.*]] = affine.min #[[$MAP4]]([[VAL_29]]){{\[}}[[VAL_42]]]
282// CHECK:             [[VAL_44:%.*]] = memref.subview [[VAL_2]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_41]], [[VAL_43]]] {{\[}}%[[C1]], %[[C1]]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, ?], offset: ?>>
283// CHECK:             [[VAL_45:%.*]] = affine.apply #[[$MAP2]]([[VAL_22]]){{\[}}%[[C1]], %[[C0]]]
284// CHECK:             [[VAL_46:%.*]] = arith.cmpi slt, [[VAL_45]], [[VAL_31_SQUARED]] : index
285// CHECK:             scf.if [[VAL_46]] {
286// CHECK:               [[VAL_47:%.*]] = affine.apply #[[$MAP2]]([[VAL_23]]){{\[}}%[[C1]], %[[C0]]]
287// CHECK:               [[VAL_48:%.*]] = arith.cmpi slt, [[VAL_47]], [[VAL_33]] : index
288// CHECK:               scf.if [[VAL_48]] {
289// CHECK:                 [[VAL_49:%.*]] = memref.load [[VAL_34]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
290// CHECK:                 [[VAL_50:%.*]] = memref.load [[VAL_39]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
291// CHECK:                 [[VAL_51:%.*]] = memref.load [[VAL_44]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
292// CHECK:                 [[VAL_52:%.*]] = arith.addf [[VAL_49]], [[VAL_50]] : f32
293// CHECK:                 memref.store [[VAL_52]], [[VAL_44]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, strided<[?, ?], offset: ?>>
294// CHECK:               }
295// CHECK:             }
296// CHECK:             gpu.terminator
297// CHECK:           }
298// CHECK:           return
299// CHECK:         }
300// CHECK:       }
301
302// -----
303
304// Optional attribute lowering test
305
306func.func @parallel_loop_optional_attr() {
307  %c0 = arith.constant 0 : index
308  %c1 = arith.constant 1 : index
  // Extra (non-mapping) attributes on scf.parallel must survive the
  // conversion and be carried over to the lowered op.
309  scf.parallel (%i0) = (%c0) to (%c1) step (%c1) {
310  } { mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>], optional_attr = 1 }
311  // CHECK: optional_attr = 1
312  return
313}
314
315// -----
316
317// Mapping to the same processor twice. Cannot be mapped.
318
319func.func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
320                          %arg3 : index,
321                          %buf : memref<?x?xf32>,
322                          %res : memref<?x?xf32>) {
323  %four = arith.constant 4 : index
  // Negative test: both dimensions request block_y, so the pattern must bail
  // out and leave the scf.parallel untouched.
324  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
325                                          step (%four, %four)  {
326  } { mapping = [
327      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
328      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>
329    ] }
330  return
331}
332
333// CHECK-LABEL: @parallel_double_map
334// CHECK: scf.parallel
335
336// -----
337
338// Loop with loop-variant upper bound. Cannot be mapped.
339
340func.func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
341                                       %arg3 : index,
342                                       %buf : memref<?x?xf32>,
343                                       %res : memref<?x?xf32>) {
344  %zero = arith.constant 0 : index
345  %one = arith.constant 1 : index
346  %four = arith.constant 4 : index
347  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
348                                          step (%four, %four)  {
    // Negative test: the inner upper bounds (%i0, %i1) are defined by the
    // outer loop itself, so launch dimensions cannot be hoisted and both
    // loops must remain unconverted.
349    scf.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
350                                            step (%one, %one)  {
351      %idx0 = arith.addi %i0, %si0 : index
352      %idx1 = arith.addi %i1, %si1 : index
353      %val = memref.load %buf[%idx0, %idx1] : memref<?x?xf32>
354      memref.store %val, %res[%idx1, %idx0] : memref<?x?xf32>
355    } { mapping = [
356        #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
357        #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
358      ] }
359  } { mapping = [
360      #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
361      #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>
362    ] }
363  return
364}
365
366// CHECK-LABEL: @parallel_loop_loop_variant_bound
367// CHECK: scf.parallel
368// CHECK: scf.parallel
369
370// -----
371
372// Loop without annotations. Cannot be mapped.
373
374func.func @parallel_no_annotations(%arg0 : index, %arg1 : index, %arg2 : index,
375                              %arg3 : index,
376                              %buf : memref<?x?xf32>,
377                              %res : memref<?x?xf32>) {
378  %four = arith.constant 4 : index
  // Negative test: no `mapping` attribute, so the conversion must not fire.
379  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
380                                          step (%four, %four)  {
381  }
382  return
383}
384
385// CHECK-LABEL: @parallel_no_annotations
386// CHECK: scf.parallel
387
388// -----
389
390// CHECK-LABEL: @step_invariant
391func.func @step_invariant() {
392  %alloc = memref.alloc() : memref<1x1xf64>
393  %alloc_0 = memref.alloc() : memref<1x1xf64>
394  %alloc_1 = memref.alloc() : memref<1x1xf64>
395  %c0 = arith.constant 0 : index
396  %c1 = arith.constant 1 : index
397  %c1_2 = arith.constant 1 : index
  // Outer loop -> block_x; the inner loop's bound/step constants are defined
  // *inside* the outer body, so the pass must prove they are loop-invariant
  // (and sink/hoist them) to form a single gpu.launch.
398  scf.parallel (%arg0) = (%c0) to (%c1) step (%c1_2) {
399    %c0_3 = arith.constant 0 : index
400    %c1_4 = arith.constant 1 : index
401    %c1_5 = arith.constant 1 : index
402    scf.parallel (%arg1) = (%c0_3) to (%c1_4) step (%c1_5) {
403      %0 = memref.load %alloc_1[%arg0, %arg1] : memref<1x1xf64>
404      %1 = memref.load %alloc_0[%arg0, %arg1] : memref<1x1xf64>
405      %2 = arith.addf %0, %1 : f64
406      memref.store %2, %alloc[%arg0, %arg1] : memref<1x1xf64>
407      scf.reduce
408    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
409    scf.reduce
410  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
411  memref.dealloc %alloc_1 : memref<1x1xf64>
412  memref.dealloc %alloc_0 : memref<1x1xf64>
413  memref.dealloc %alloc : memref<1x1xf64>
414  return
415}
416
417// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<1x1xf64>
418// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<1x1xf64>
419// CHECK: %[[alloc_2:.*]] = memref.alloc() : memref<1x1xf64>
420// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
421// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
422// CHECK: gpu.launch
423// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
424// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
425// CHECK: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
426// CHECK: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
427// CHECK: %[[lhs:.*]] = memref.load %[[alloc_2]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
428// CHECK: %[[rhs:.*]] = memref.load %[[alloc_1]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
429// CHECK: %[[sum:.*]] = arith.addf %[[lhs]], %[[rhs]] : f64
430// CHECK: memref.store %[[sum]], %[[alloc_0]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
431
432// -----
433
434// 1-d parallel reduction mapped to block.x and thread.x.
435
436// CHECK-LABEL: @parallel_reduction_1d
437func.func @parallel_reduction_1d() {
438  %alloc = memref.alloc() : memref<f32>
439  %alloc_0 = memref.alloc() : memref<64xf32>
440  %c1 = arith.constant 1 : index
441  %c64 = arith.constant 64 : index
442  %c0 = arith.constant 0 : index
443  %cst = arith.constant 0.000000e+00 : f32
444  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    // Inner thread_x-mapped loop carries an scf.reduce with an addf reduction;
    // it should lower to gpu.all_reduce with the region turned into gpu.yield.
445    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
446      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
447      scf.reduce(%1 : f32) {
448      ^bb0(%arg3: f32, %arg4: f32):
449        %2 = arith.addf %arg3, %arg4 : f32
450        scf.reduce.return %2 : f32
451      }
452    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
453    memref.store %0, %alloc[] : memref<f32>
454    scf.reduce
455  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
456  memref.dealloc %alloc : memref<f32>
457  memref.dealloc %alloc_0 : memref<64xf32>
458  return
459}
460
461// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
462// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
463// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
464// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
465// CHECK: gpu.launch
466// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
467// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
468// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
469// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
470// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]]] : memref<64xf32>
471// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
472// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
473// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
474// CHECK-NEXT: gpu.yield %[[sum]] : f32
475// CHECK-NEXT: } : (f32) -> f32
476// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>
477
478// -----
479
480// 2-d parallel reduction mapped to block.x and thread.x and thread.y.
481
482// CHECK-LABEL: @parallel_reduction_2d
483func.func @parallel_reduction_2d() {
484  %alloc = memref.alloc() : memref<f32>
485  %alloc_0 = memref.alloc() : memref<8x8xf32>
486  %c1 = arith.constant 1 : index
487  %c8 = arith.constant 8 : index
488  %c0 = arith.constant 0 : index
489  %cst = arith.constant 0.000000e+00 : f32
490  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    // 2-d reduction loop mapped to thread_x/thread_y; both induction variables
    // feed the load, and the addf reduction should become gpu.all_reduce.
491    %0 = scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c8, %c8) step (%c1, %c1) init (%cst) -> f32 {
492      %1 = memref.load %alloc_0[%arg2, %arg3] : memref<8x8xf32>
493      scf.reduce(%1 : f32) {
494      ^bb0(%arg4: f32, %arg5: f32):
495        %2 = arith.addf %arg4, %arg5 : f32
496        scf.reduce.return %2 : f32
497      }
498    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
499    memref.store %0, %alloc[] : memref<f32>
500    scf.reduce
501  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
502  memref.dealloc %alloc : memref<f32>
503  memref.dealloc %alloc_0 : memref<8x8xf32>
504  return
505}
506
507// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
508// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<8x8xf32>
509// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
510// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
511// CHECK: %[[map_2:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
512// CHECK: gpu.launch
513// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
514// CHECK-SAME: threads(%[[arg_3:.*]], %[[arg_4:.*]], %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %[[map_2]], %{{[^)]*}} = %{{[^)]*}})
515// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
516// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
517// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_4]])[{{.*}}, {{.*}}]
518// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]], %[[dim2]]] : memref<8x8xf32>
519// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
520// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
521// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
522// CHECK-NEXT: gpu.yield %[[sum]] : f32
523// CHECK-NEXT: } : (f32) -> f32
524// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>
525
526// -----
527
528// tiled 1-d parallel reduction mapped to block.x and thread.x.
529
530// CHECK-LABEL: @parallel_reduction_1d_tiled
func.func @parallel_reduction_1d_tiled() {
  %c128 = arith.constant 128 : index  // tile size / inner trip count
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index    // number of tiles / outer trip count
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32  // reduction identity for addf
  %alloc_0 = memref.alloc() : memref<8192xf32>
  %alloc_1 = memref.alloc() : memref<64xf32>
  // Outer loop: one iteration per tile, mapped to block_x below.
  scf.parallel (%arg1) = (%c0) to (%c64) step (%c1) {
    // Per-tile scalar destination in the 64-element result buffer.
    %subview = memref.subview %alloc_1[%arg1] [1] [1] : memref<64xf32> to memref<f32, strided<[], offset: ?>>
    // Tile offset into the flat 8192-element source: tile index * 128.
    %0 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
    %subview_1 = memref.subview %alloc_0[%0] [128] [1] : memref<8192xf32> to memref<128xf32, strided<[1], offset: ?>>
    // Inner loop: f32 add-reduction over the 128-element tile, mapped to
    // thread_x below; expected to lower to a gpu.all_reduce (see the
    // FileCheck expectations following this function).
    %1 = scf.parallel (%arg2) = (%c0) to (%c128) step (%c1) init (%cst) -> f32 {
      %2 = memref.load %subview_1[%arg2] : memref<128xf32, strided<[1], offset: ?>>
      scf.reduce(%2 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %3 = arith.addf %arg3, %arg4 : f32
        scf.reduce.return %3 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    // Store the tile's reduced value into its per-tile slot.
    memref.store %1, %subview[] : memref<f32, strided<[], offset: ?>>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc_0 : memref<8192xf32>
  memref.dealloc %alloc_1 : memref<64xf32>
  return
}
558
559// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<8192xf32>
560// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
561// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
562// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
563// CHECK: gpu.launch
564// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
565// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
566// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
567// CHECK-NEXT: %[[dst:.*]] = memref.subview %[[alloc_1]][%[[dim0]]] [1] [1] : memref<64xf32>
568// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map2(%[[dim0]])
569// CHECK-NEXT: %[[tile:.*]] = memref.subview %[[alloc_0]][%[[dim1]]] [128] [1] : memref<8192xf32>
570// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
571// CHECK-NEXT: %[[src:.*]] = memref.load %[[tile]][%[[dim2]]] : memref<128xf32, strided<[1], offset: ?>>
572// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
573// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
574// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
575// CHECK-NEXT: gpu.yield %[[sum]] : f32
576// CHECK-NEXT: } : (f32) -> f32
577// CHECK-NEXT: memref.store %[[res]], %[[dst]][] : memref<f32, strided<[], offset: ?>>
578
579// -----
580
581// 1-d parallel reduction, unsigned int. Cannot be mapped.
582
583// CHECK-LABEL: @parallel_reduction_1d_uint
func.func @parallel_reduction_1d_uint(%cst : ui32) {
  %alloc = memref.alloc() : memref<ui32>
  %alloc_0 = memref.alloc() : memref<64xui32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  // Negative test: the reduction element type is ui32, and the expectations
  // after this function verify that both loops remain scf.parallel (the
  // pass does not convert this nest to a gpu.launch).
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> ui32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xui32>
      scf.reduce(%1 : ui32) {
      ^bb0(%arg3: ui32, %arg4: ui32):
        // Combiner simply forwards the first operand (no ui32 arithmetic
        // op is used here); the reduced type is what matters for the test.
        scf.reduce.return %arg3 : ui32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<ui32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<ui32>
  memref.dealloc %alloc_0 : memref<64xui32>
  return
}
605
606// CHECK: scf.parallel
607// CHECK-NEXT: scf.parallel
608// CHECK: scf.reduce
609
610// -----
611
612// 1-d parallel reduction, not isolated from above. Cannot be mapped.
613
614// CHECK-LABEL: @parallel_reduction_1d_outside
func.func @parallel_reduction_1d_outside() {
  %alloc = memref.alloc() : memref<f32>
  %alloc_0 = memref.alloc() : memref<64xf32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  // Defined outside the loop nest but referenced inside the reduce
  // combiner below — this is the "not isolated from above" condition
  // the test exercises.
  %const = arith.constant 1.000000e+00 : f32
  // Negative test: the expectations after this function verify that both
  // loops remain scf.parallel (no conversion to gpu.launch happens).
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
      scf.reduce(%1 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %2 = arith.addf %arg3, %arg4 : f32
        // Uses %const captured from the enclosing function body.
        %3 = arith.addf %2, %const : f32
        scf.reduce.return %3 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<f32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<f32>
  memref.dealloc %alloc_0 : memref<64xf32>
  return
}
640
641// CHECK: scf.parallel
642// CHECK-NEXT: scf.parallel
643// CHECK: scf.reduce
644